1 Aim of the analysis

Aim of this analysis is to identiy processing sites (PSS) which are dependent on RNase E 5’ sensing by comparing PSS detected in two strains: rne(5p) and rne(WT). Also, PSS affected by overexpressing RNase E and RNase HII will be identified by comparing rne(WT) with wild-type Synechocystis (WT). The respective sets of TSS will be further analysed, e.g. for secondary structures. rne(WT) will partially be reffered to as “dWT”, rne(5p) as “dTV” or “TV”.

#import gff for anno-TSS
anno_TSS <- unique(rtracklayer::import("input/Kopf_TSS.gff")) # 14 int-TSS are duplicates and are also annotated as alt-TSS
features <- rtracklayer::import("input/20210217_syne_onlyUnique_withFeat.gff3")
TUs <- rtracklayer::import("input/Kopf_4091_TUs_combined.gff3")
# import PSS and TSS identified in TIER-seq analysis
TIERseq_PSS_TSS <- rtracklayer::import("input/TSS_PSS_rneAnalysis_all.gff")

2 Identification and characterization of TSS and PSS sites

2.1 Identification of TSS and PSS using edgeR and DESeq2

Input are tables of PSS and TSS 5’ end counts prepared using a workflow on usegalaxy.eu and a python script. Sequencing data was created from libraries prepared similar to the protocol described in Innocenti et al. (2015) (tagRNA-Seq).

PSS_raw <- read.delim("input/PSS_5ends_combined_5sensing.txt", header=TRUE, row.names=1)
names(PSS_raw) <- paste(names(PSS_raw), "_PSS", sep="")
TSS_raw <- read.delim("input/TSS_5ends_combined_5sensing.txt", header=TRUE, row.names=1)
names(TSS_raw) <- paste(names(TSS_raw), "_TSS", sep="")

# merge PSS and TSS to merged_raw
merged_raw <- merge(PSS_raw, TSS_raw, by="row.names")
row.names(merged_raw) <- merged_raw$Row.names
merged_raw$Row.names <- NULL
rm(PSS_raw)
rm(TSS_raw)

When taking into account lowly populated sites, DESeq2 dispersion estimates become over-fitted. Hence, edgeR::filterByExpression() is used as a pre-processing step before further DESeq2 analyses.

group=c(rep("dWT_PSS", 3), rep("WT_PSS", 3), rep("TV_PSS", 2), rep("dWT_TSS", 3), rep("WT_TSS", 3), rep("TV_TSS", 2))
y <- DGEList(counts=merged_raw, group=group)
nrow(y)
## [1] 7894038
keep <- filterByExpr(y)
y <- y[keep, ,keep.lib.size=FALSE] #reduces from 7,894,038 positions to 15,721
nrow(y)
## [1] 15721
merged_filtered <- merged_raw[row.names(y$counts),]
nrow(merged_filtered)
## [1] 15721
rm(merged_raw)

Since TSS libraries are generally larger than PSS libraries, size factors are determined separately for the different data set types. Size factors for TSS and PSS correlate quite well for different samples.

coldata_PSS <- read.csv("input/colData_PSS.csv", row.names=1)
coldata_TSS <- read.csv("input/colData_TSS.csv", row.names=1)

# create DESeq2 data object
ddsMat_PSS <- DESeqDataSetFromMatrix(countData = merged_filtered[,1:8],
                                 colData = coldata_PSS,
                                 design = ~ strain)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
ddsMat_TSS <- DESeqDataSetFromMatrix(countData = merged_filtered[,9:16],
                                 colData = coldata_TSS,
                                 design = ~ strain)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
# run DESeq
ddsMat_PSS <- estimateSizeFactors(ddsMat_PSS) 
ddsMat_TSS <- estimateSizeFactors(ddsMat_TSS) 

# factors needed for creation of .grp file for multireads data
write.csv(data.frame(factor=ddsMat_PSS$sizeFactor), file="output/PSS_sizeFactors.csv")
write.csv(data.frame(factor=ddsMat_TSS$sizeFactor), file="output/TSS_sizeFactors.csv")

sizeFact <- c(ddsMat_PSS$sizeFactor, ddsMat_TSS$sizeFactor)
plot(sizeFact[1:8], sizeFact[9:16], xlab="Size Factors PSS", ylab="Size Factors TSS")

If .grp were not yet created, create .grp files by setting if() to TRUE (! takes some time !):

if(FALSE){
PSS_raw <- read.delim("input/PSS_5ends_combined_5sensing.txt", header=TRUE, row.names=1)
names(PSS_raw) <- paste(names(PSS_raw), "_PSS", sep="")

ddsMat_PSS_2 <- DESeqDataSetFromMatrix(countData = PSS_raw,
                                 colData = coldata_PSS,
                                 design = ~ strain)
ddsMat_PSS_2$sizeFactor <- ddsMat_PSS$sizeFactor

PSS_norm_counts <- counts(ddsMat_PSS_2, normalized=TRUE)
rm(ddsMat_PSS_2)

grp_File <- data.frame("WT"=apply(PSS_norm_counts[,c("WT1_PSS", "WT2_PSS", "WT3_PSS")],1,mean), "dWT"=apply(PSS_norm_counts[,c("dWT1_PSS", "dWT2_PSS", "dWT3_PSS")],1,mean), "TV"=apply(PSS_norm_counts[,c("TV1_PSS", "TV2_PSS")],1,mean))
 
grp_File$seqname <- rep(NA, length(grp_File$WT))
seqnames <- c("BA000022.2", "AP004310.1", "AP004311.1", "AP004312.1", "AP006585.1")
for(i in seqnames){
  grp_File[which(grepl(i, row.names(grp_File))),]$seqname <- i
}
grp_File$strand <- rep(NA, length(grp_File$WT))
for(i in c("plus", "minus")){
  grp_File[which(grepl(i, row.names(grp_File))),]$strand <- i
}

for(i in seqnames){
  for(j in c("plus", "minus")){
    tmp <- subset(grp_File, grp_File$seqname==i & grp_File$strand==j)[,c("WT", "dWT", "TV")]
    s=""
    if(j=="plus"){
      s="fw"
    }else{
      s="rev"
   }
    write.table(tmp, file=paste("output/grp/PSS/", i, "_PSS_WT-dWT-TV_", s, ".grp", sep=""), sep="\t", row.names=FALSE, col.names = FALSE)
  }
}
  
}
if(FALSE){
TSS_raw <- read.delim("input/TSS_5ends_combined_5sensing.txt", header=TRUE, row.names=1)
names(TSS_raw) <- paste(names(TSS_raw), "_TSS", sep="")

ddsMat_TSS_2 <- DESeqDataSetFromMatrix(countData = TSS_raw,
                                 colData = coldata_TSS,
                                 design = ~ strain)
ddsMat_TSS_2$sizeFactor <- ddsMat_TSS$sizeFactor

TSS_norm_counts <- counts(ddsMat_TSS_2, normalized=TRUE)
rm(ddsMat_TSS_2)

grp_File <- data.frame("WT"=apply(TSS_norm_counts[,c("WT1_TSS", "WT2_TSS", "WT3_TSS")],1,mean), "dWT"=apply(TSS_norm_counts[,c("dWT1_TSS", "dWT2_TSS", "dWT3_TSS")],1,mean), "TV"=apply(TSS_norm_counts[,c("TV1_TSS", "TV2_TSS")],1,mean))
 
grp_File$seqname <- rep(NA, length(grp_File$WT))
seqnames <- c("BA000022.2", "AP004310.1", "AP004311.1", "AP004312.1", "AP006585.1")
for(i in seqnames){
  grp_File[which(grepl(i, row.names(grp_File))),]$seqname <- i
}
grp_File$strand <- rep(NA, length(grp_File$WT))
for(i in c("plus", "minus")){
  grp_File[which(grepl(i, row.names(grp_File))),]$strand <- i
}

for(i in seqnames){
  for(j in c("plus", "minus")){
    tmp <- subset(grp_File, grp_File$seqname==i & grp_File$strand==j)[,c("WT", "dWT", "TV")]
    s=""
    if(j=="plus"){
      s="fw"
    }else{
      s="rev"
   }
    write.table(tmp, file=paste("output/grp/TSS/", i, "_TSS_WT-dWT-TV_", s, ".grp", sep=""), sep="\t", row.names=FALSE, col.names = FALSE)
  }
}
  
}

DESeq2 is used to determine which positions are enriched in the TSS and PSS libraries, respectively. Size factors determined above are used.

coldata <- read.csv("input/colData_PSS-TSS.csv", row.names=1)

# create DESeq2 data object
ddsMat_PSSTSS <- DESeqDataSetFromMatrix(countData = merged_filtered,
                                 colData = coldata,
                                 design = ~ strain + type)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
ddsMat_PSSTSS$sizeFactor <- sizeFact
ddsMat_PSSTSS <- DESeq(ddsMat_PSSTSS)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res_PSSTSS <- results(ddsMat_PSSTSS, contrast=c("type", "PSS", "TSS"))
write.csv(res_PSSTSS[order(res_PSSTSS$padj),], file="output/DESeq2_resultsTables/results_PSS-TSS-copmarisons.csv")

As a next stop, a first unfiltered set of potential PSS and TSS positions is created:

PSS_positions_unfiltered <- row.names(subset(res_PSSTSS, res_PSSTSS$log2FoldChange>0.8 & res_PSSTSS$padj<0.05))
TSS_positions_unfiltered <- row.names(subset(res_PSSTSS, res_PSSTSS$log2FoldChange<(-0.8) & res_PSSTSS$padj<0.05))

Create GRanges object for PSS data:

PSS_nonReduced_Ranges <- create_GRanges_object(base::strsplit(PSS_positions_unfiltered, "-"), res_PSSTSS[PSS_positions_unfiltered,]$baseMean)
PSS_nonReduced_Ranges$name <- PSS_positions_unfiltered
PSS_nonReduced_Ranges$type <- "PSS"

2.1.1 Diagnostic Plots

pdf(file="output/DESeq2_Plots/ddsMat_PSSTSS_DispEsts.pdf", width=4.5, height=4.5)
plotDispEsts(ddsMat_PSSTSS, xlab="Mean of Normalized Counts", ylab="Dispersion")
dev.off()
## png 
##   2
plotDispEsts(ddsMat_PSSTSS)

PSS and TSS data sets are separated on principal component 1.

p <- PCA_plot_PSSTSS(ddsMat_PSSTSS, "PSS-TSS")
p

ggsave("output/DESeq2_Plots/ddsMat_PSSTSS_PCA.pdf", plot=p, width=15, height=9, units="cm")

p <- heatmap_plot_PSSTSS(ddsMat_PSSTSS, "PSS-TSS")
p

ggsave("output/DESeq2_Plots/ddsMat_PSSTSS_heatMap.pdf", plot=p, width=15, height=12, units="cm")

Comparing PSS and TSS using DESeq2 yields a bimodal log2foldchange distribution. This underlines that PSS and TSS are quite well discernible using the chosen approach. This log2FC distribution is also observable in the non-normalized count data. This is further corroborated by the MAplot and also regarding p-values etc. (see below: Volcano plot and p-value-plot).

pdf(file="output/DESeq2_Plots/ddsMat_PSSTSS_hist_log2FC.pdf", width=4.5, height=4.5)
hist(res_PSSTSS$log2FoldChange, breaks=20, main="", xlab="Log2FC(PSS/TSS)", ylab="Frequency")
dev.off()
## png 
##   2
hist(res_PSSTSS$log2FoldChange, breaks=20, main="DESeq2 Normalization", xlab="Log2FC PSS/TSS", ylab="Frequency")

# non-normalized
log2_all <- log2(apply(merged_filtered[,1:8],1, mean)/apply(merged_filtered[,9:16],1,mean))
hist(log2_all, breaks=20, main="No normalization", xlab="Log2(PSS/TSS)")

p <- MAplot_ggplot(res_PSSTSS, foldchange=0.8, y_axis_label = "Log2 fold-change(PSS/TSS)")
p <- p + scale_colour_manual(values=c("UP"="#4e9879ff", "DOWN"="#9b511fff", "NO"="#d3d3d3ff"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p

ggsave("output/DESeq2_Plots/ddsMat_PSSTSS_MAplot.pdf",plot=p, width=15, height=12, units="cm")
res_PSSTSS <- subset(res_PSSTSS, !is.na(res_PSSTSS$padj))

# down: TSS, up: PSS
count_up_down(res_PSSTSS, foldchange=0.8, padjusted=0.05)
## [1] "number of features down:  8841"
## [1] "number of features up:  4632"
p <- volcanoPlot_ggplot(res_PSSTSS, foldchange=0.8, padjusted=0.05) + scale_colour_manual(values=c("UP"="#4e9879ff", "DOWN"="#9b511fff", "NO"="#d3d3d3ff"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p

ggsave("output/DESeq2_Plots/ddsMat_PSSTSS_VolcanoPlot.pdf", plot=p, width=15, height=12, units="cm")

pdf(file="output/DESeq2_Plots/ddsMat_PSSTSS_pValuePlot.pdf", width=4.5, height=4.5)
pvaluePlot(res_PSSTSS, "Comparison PSS to TSS")
dev.off()
## png 
##   2
pvaluePlot(res_PSSTSS, "Comparison PSS to TSS")

2.1.2 Filter potential TSS-positions for positions with (TSS-reads)/(transcript-coverage)>0.02 and <2.0

By inspecting the identified TSS, we observed that a high number of false-positive TSS positions are present in highly expressed genes. To control for this, we control for TSS with a ratio (number TSS-reads)/(number transcript reads) above a certain threshold. To determine this cut-off, we manually inspected highly expressed genomic positions such as the 5’ UTR of psbA2, ncr0700, ssrA and CRISPR3. For instance for psbA2, the actual TSS are well established by independent experiments (Sakurai et al. (2012)). 2% seemed to be a good cut-off for this control.

Further, we introduced a cut-off to filter against TSS without associated transcription. The used library preparation protocol does not capture sRNAs with a length below 200nt and TSS without accompanying transcription might be TSS of sRNAs. However, we assume a high percentage of TSS which were identified without associated transcription are false positives, e.g. created by reverse transcription artefacts. An example is the second TSS upstream of PsbA2R. The actual TSS of the transcript was determined in Sakurai et al. (2012). We decided to filter out TSS for which the ratio (number TSS-reads)/(number transcript coverage) > 200%.

First, transcript coverage data has to be read in and normalized:

transcript_coverage <- read.delim("input/transcript_coverage_combined_5sensing.txt", header=TRUE, row.names=1)

group=c(rep("dWT", 3), rep("WT", 3), rep("TV", 2))
y <- DGEList(counts=transcript_coverage, group=group)
nrow(y)
## [1] 7894038
keep <- filterByExpr(y)
y <- y[keep, ,keep.lib.size=FALSE] #reduces from 7,894,038 positions to 4,017,775
nrow(y)
## [1] 4017775
transcript_coverage_filt <- transcript_coverage[row.names(y$counts),]
rm(transcript_coverage) # remove to make space 

coldata <- read.csv("input/colData_transcript.csv", row.names=1)

# create DESeq2 data object
ddsMat_transc <- DESeqDataSetFromMatrix(countData = transcript_coverage_filt,
                                 colData = coldata,
                                 design = ~ strain)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
ddsMat_transc <- estimateSizeFactors(ddsMat_transc)
trans_norm <- as.data.frame(counts(ddsMat_transc, normalized=TRUE))
TSS_norm <- as.data.frame(counts(ddsMat_TSS, normalized=TRUE))

Do actual filtering and select TSS which are located within a distance of 20nt of annotated TSS by Kopf et al. (2014). Save those TSS combined with the PSS determined as part of this analysis as .gff

TSS_trans_ratio_cutoff <- 0.02
upper_TSS_trans_ratio_cutoff <- 2.00
TSS_trans_ratio <- apply(TSS_norm[TSS_positions_unfiltered,],1,mean)/apply(trans_norm[TSS_positions_unfiltered,],1,mean)
TSS_above_cutoff <- subset(TSS_positions_unfiltered, TSS_trans_ratio[TSS_positions_unfiltered]>TSS_trans_ratio_cutoff & TSS_trans_ratio[TSS_positions_unfiltered]<upper_TSS_trans_ratio_cutoff)
TSS_nonReduced_Ranges <- create_GRanges_object(base::strsplit(TSS_above_cutoff, "-"), res_PSSTSS[TSS_above_cutoff,]$baseMean)
TSS_nonReduced_Ranges$name <- TSS_above_cutoff
highly_prob_TSS <- TSS_nonReduced_Ranges[which(countOverlaps(TSS_nonReduced_Ranges, anno_TSS, maxgap=20)>0)]
names_prob_TSS <- highly_prob_TSS$name
length(reduce(highly_prob_TSS))
## [1] 2863
highly_prob_TSS$type <- "TSS"
save_gff(c(highly_prob_TSS,PSS_nonReduced_Ranges), "output/annoTSS_comparison/gffs/only5sensing/", "TSS_PSS_only5sensing")

2.2 Combining TSS and PSS identified here with TSS, PSS from TIER-seq analysis

TSS_TIERseq <- subset(TIERseq_PSS_TSS, TIERseq_PSS_TSS$type=="TSS")
PSS_TIERseq <- subset(TIERseq_PSS_TSS, TIERseq_PSS_TSS$type=="PSS")
PSS_combined <- unique(c(PSS_nonReduced_Ranges, PSS_TIERseq))
length(PSS_combined)
## [1] 6490
length(PSS_combined)/length(c(PSS_TIERseq,PSS_nonReduced_Ranges))
## [1] 0.8030191
PSS_combined$strand_ident <- NA
PSS_combined[strand(PSS_combined)=="+",]$strand_ident <- "plus"
PSS_combined[strand(PSS_combined)=="-",]$strand_ident <- "minus"

PSS_positions_combined <- paste(seqnames(PSS_combined), start(PSS_combined), PSS_combined$strand_ident, sep="-")
TSS_combined <- unique(c(highly_prob_TSS, TSS_TIERseq))
length(TSS_combined)
## [1] 4440
length(TSS_combined)/length(c(highly_prob_TSS, TSS_TIERseq))
## [1] 0.6357388
TSS_combined$strand_ident <- NA
TSS_combined[strand(TSS_combined)=="+",]$strand_ident <- "plus"
TSS_combined[strand(TSS_combined)=="-",]$strand_ident <- "minus"

TSS_positions_combined <- paste(seqnames(TSS_combined), start(TSS_combined), TSS_combined$strand_ident, sep="-")
TSS_combined$type <- "TSS"
PSS_combined$type <- "PSS"
save_gff(c(TSS_combined, PSS_combined), "output/annoTSS_comparison/gffs/combined/", "TSS_PSS_combined")

2.2.1 Compare positions which will be used in analysis with anno-TSS

PSS_positions_Ranges <- GenomicRanges::reduce(PSS_combined)
length(PSS_positions_Ranges)
## [1] 5429
TSS_positions_Ranges <- GenomicRanges::reduce(TSS_combined)
length(TSS_positions_Ranges)
## [1] 3511
p <- ggplot(data=as.data.frame(PSS_positions_Ranges), aes(x=width)) + geom_histogram(fill="darkgray", binwidth=1, color="darkgray") + theme_light() + xlab("Width of reported PSS") + ylab("Number of cases") + scale_x_continuous(breaks=c(1,2,3,4,5,6,7,8,9,10))
p 

ggsave("output/annoTSS_comparison/hist_width_reported-PSS.pdf", plot=p, width=9, height=9, units="cm")

p <- ggplot(data=as.data.frame(TSS_positions_Ranges), aes(x=width)) + geom_histogram(fill="darkgray", binwidth=1, color="darkgray") + theme_light() + xlab("Width of reported TSS") + ylab("Number of cases") + scale_x_continuous(breaks=c(1,2,3,4,5,6,7,8,9,10), limits=c(0,10))
p 
## Warning: Removed 2 rows containing missing values (geom_bar).

ggsave("output/annoTSS_comparison/hist_width_reported-TSS.pdf", plot=p, width=9, height=9, units="cm")
## Warning: Removed 2 rows containing missing values (geom_bar).
# prepare data.frame to count numbers of TSS, PSS overlapping with anno-TSS
df_overlap_TSSPSS <- as.data.frame(cbind(type=c(rep("PSS", 4), rep("TSS",4)), TSS=c(rep(c("start", "alt", "int", "i-start"), 2))))
df_overlap_TSSPSS$number <- NA

## count TSS, PSS overlapping with certain anno-TSS
# overlap of start anno_TSS _without_ TU_type: i with PSS, TSS
df_overlap_TSSPSS[df_overlap_TSSPSS$type=="PSS" & df_overlap_TSSPSS$TSS=="start",3] <- sum(countOverlaps(PSS_positions_Ranges, subset(anno_TSS, anno_TSS$TSS_type=="start" & !grepl("i", anno_TSS$TU_type, fixed = TRUE))))

df_overlap_TSSPSS[df_overlap_TSSPSS$type=="TSS" & df_overlap_TSSPSS$TSS=="start",3] <- sum(countOverlaps(TSS_positions_Ranges, subset(anno_TSS, anno_TSS$TSS_type=="start" & !grepl("i", anno_TSS$TU_type, fixed = TRUE))))

# overlap of start anno_TSS _with only_ TU_type: i with PSS, TSS
df_overlap_TSSPSS[df_overlap_TSSPSS$type=="PSS" & df_overlap_TSSPSS$TSS=="i-start",3] <- sum(countOverlaps(PSS_positions_Ranges, subset(anno_TSS, anno_TSS$TSS_type=="start" & grepl("i", anno_TSS$TU_type, fixed = TRUE))))

df_overlap_TSSPSS[df_overlap_TSSPSS$type=="TSS" & df_overlap_TSSPSS$TSS=="i-start",3] <- sum(countOverlaps(TSS_positions_Ranges, subset(anno_TSS, anno_TSS$TSS_type=="start" & grepl("i", anno_TSS$TU_type, fixed = TRUE))))

# overlap of alt anno_TSS with PSS, TSS
df_overlap_TSSPSS[df_overlap_TSSPSS$type=="PSS" & df_overlap_TSSPSS$TSS=="alt",3] <- sum(countOverlaps(PSS_positions_Ranges, subset(anno_TSS, anno_TSS$TSS_type=="alt")))
df_overlap_TSSPSS[df_overlap_TSSPSS$type=="TSS" & df_overlap_TSSPSS$TSS=="alt",3] <- sum(countOverlaps(TSS_positions_Ranges, subset(anno_TSS, anno_TSS$TSS_type=="alt")))

# overlap of int anno_TSS with PSS, TSS
df_overlap_TSSPSS[df_overlap_TSSPSS$type=="PSS" & df_overlap_TSSPSS$TSS=="int",3] <- sum(countOverlaps(PSS_positions_Ranges, subset(anno_TSS, anno_TSS$TSS_type=="int")))
df_overlap_TSSPSS[df_overlap_TSSPSS$type=="TSS" & df_overlap_TSSPSS$TSS=="int",3] <- sum(countOverlaps(TSS_positions_Ranges, subset(anno_TSS, anno_TSS$TSS_type=="int")))

#control
sum((countOverlaps(PSS_positions_Ranges, anno_TSS)>0))
## [1] 55
sum(df_overlap_TSSPSS[df_overlap_TSSPSS$type=="PSS",]$number)
## [1] 55
sum((countOverlaps(TSS_positions_Ranges,anno_TSS)>0))
## [1] 2485
sum((countOverlaps(anno_TSS,TSS_positions_Ranges)>0)) # problem: anno_TSS are not unique - some alt and int TSS overlap. 
## [1] 2518
sum(df_overlap_TSSPSS[df_overlap_TSSPSS$type=="TSS",]$number)
## [1] 2518

The majority of anno-TSS which overlap with PSS are not canonical start TSS, but actual int or alt TSS.

p <- ggplot(data=df_overlap_TSSPSS, aes(x=type, y=number, fill=TSS)) + geom_bar(position="fill", stat="identity") + theme_light()+ xlab("") + ylab("Percentage") + scale_fill_grey(start = .2, end = .9)
p

ggsave(filename = "output/annoTSS_comparison/barplot_TSS-PSS-KopfTSS.pdf", plot = p, width = 7, height = 7, units = "cm")
write.csv(as.data.frame(PSS_positions_Ranges[which(countOverlaps(PSS_positions_Ranges, anno_TSS)>0)]), file="output/annoTSS_comparison/PSS_overlapping_annoTSS.csv")

Calculate how many PSS overlap with annotated regions:

sum(countOverlaps(PSS_positions_Ranges,TUs)>0|countOverlaps(PSS_positions_Ranges,features)>0)/length(PSS_positions_Ranges)
## [1] 0.9845275

Calculate number and percentage of TUs and CDS which are overlapped by captured PSS:

sum(countOverlaps(TUs,PSS_positions_Ranges)>0)
## [1] 1061
sum(countOverlaps(TUs,PSS_positions_Ranges)>0)/length(TUs)
## [1] 0.2593498
sum(countOverlaps(features,PSS_positions_Ranges)>0)
## [1] 1321
sum(countOverlaps(features,PSS_positions_Ranges)>0)/length(features)
## [1] 0.215357

2.2.2 Extract DNA sequences upstream of TSS and upstream + downstream of PSS

syne_fasta <- readDNAStringSet("input/Synechocystis.fasta", format="fasta")

The following chunk of code filters TSS and PSS positions, so that the resulting GRanges objects only encompass TSS/PSS positions which are the most populated in a TSS/PSS region (a stretch of directly adjacent TSS/PSS).

TSS_advReduce_Ranges <- advanced_reduce(TSS_nonReduced_Ranges)
PSS_advReduce_Ranges <- advanced_reduce(PSS_nonReduced_Ranges)

Actually interesting stuff is only happening in region -20 nt upstream: Hence, only extract this region.

TSS_DNA_sequences_40nt <- extract_DNA_sequences(TSS_advReduce_Ranges, syne_fasta, len=20, only_upstream=TRUE)
PSS_DNA_sequences <- extract_DNA_sequences(PSS_advReduce_Ranges, syne_fasta, len=10, only_upstream=FALSE)

writeXStringSet(TSS_DNA_sequences_40nt, "output/annoTSS_comparison/weblogos/allTSS_20nt_forWeblogo.fasta", append=FALSE,
                compress=FALSE, compression_level=NA, format="fasta")
writeXStringSet(PSS_DNA_sequences, "output/annoTSS_comparison/weblogos/PSS_10nt_forWeblogo_upAndDownstream.fasta", append=FALSE,
                compress=FALSE, compression_level=NA, format="fasta")

run createWeblogos_10-20nt_DNA.txt for creation of sequence logos

3 Use newly identified sets of TSS and PSS for identifying differentially expressed positions between WT, rne(WT) and rne(5p)

PSS_raw <- read.delim("input/PSS_5ends_combined_5sensing.txt", header=TRUE, row.names=1)
PSS_use <- PSS_raw[PSS_positions_combined,]

TSS_raw <- read.delim("input/TSS_5ends_combined_5sensing.txt", header=TRUE, row.names=1)
TSS_use <- TSS_raw[TSS_positions_combined,]

rm(PSS_raw)
rm(TSS_raw)

coldata <- read.csv("input/colData.csv", row.names=1)

ddsMat_PSS <- DESeqDataSetFromMatrix(countData = PSS_use,
                                 colData = coldata,
                                 design = ~ strain)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
ddsMat_TSS <- DESeqDataSetFromMatrix(countData = TSS_use,
                                 colData = coldata,
                                 design = ~ strain)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
ddsMat_PSS <- DESeq(ddsMat_PSS) 
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsMat_TSS <- DESeq(ddsMat_TSS) 
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
write.csv(data.frame(counts(ddsMat_PSS, normalized=TRUE)), file="output/PSS_normalizedCounts.csv")
write.csv(data.frame(counts(ddsMat_TSS, normalized=TRUE)), file="output/TSS_normalizedCounts.csv")

3.1 Diagnostic Plots

pdf(file="output/DESeq2_Plots/ddsMat_PSS_DispEsts.pdf", width=4.5, height=4.5)
plotDispEsts(ddsMat_PSS, main="PSS comparison", xlab="Mean of Normalized Counts", ylab="Dispersion")
dev.off()
## png 
##   2
pdf(file="output/DESeq2_Plots/ddsMat_TSS_DispEsts.pdf", width=4.5, height=4.5)
plotDispEsts(ddsMat_TSS, main="TSS comparison", xlab="Mean of Normalized Counts", ylab="Dispersion")
dev.off()
## png 
##   2
plotDispEsts(ddsMat_PSS, main="PSS comparison", xlab="Mean of Normalized Counts", ylab="Dispersion")

plotDispEsts(ddsMat_TSS, main="TSS comparison", xlab="Mean of Normalized Counts", ylab="Dispersion")

p <- PCA_plot(ddsMat_PSS, "PSS")
p 

ggsave("output/DESeq2_Plots/ddsMat_PSS_PCA.pdf", plot=p, width=9, height=9, units="cm")

p <- PCA_plot(ddsMat_TSS, "TSS")
p

ggsave("output/DESeq2_Plots/ddsMat_TSS_PCA.pdf", plot=p, width=9, height=9, units="cm")

p <- heatmap_plot(ddsMat_PSS, "PSS")
p

ggsave("output/DESeq2_Plots/ddsMat_PSS_heatMap.pdf", plot=p, width=15, height=12, units="cm")

p <- heatmap_plot(ddsMat_TSS, "TSS")
p

ggsave("output/DESeq2_Plots/ddsMat_TSS_heatMap.pdf", plot=p, width=15, height=12, units="cm")

3.2 Extract Results

# extract results
PSS_result_dWT_WT <- results(ddsMat_PSS, contrast=c("strain", "dWT", "WT")) # dWT/WT -> higher in dWT: higher log2FC
write.csv(PSS_result_dWT_WT[order(PSS_result_dWT_WT$padj),], file="output/DESeq2_resultsTables/results_PSS-dWT-WT.csv")

PSS_result_dWT_TV <- results(ddsMat_PSS, contrast=c("strain", "dWT", "TV")) # dWT/TV -> higher in dWT: higher log2FC
write.csv(PSS_result_dWT_TV[order(PSS_result_dWT_TV$padj),], file="output/DESeq2_resultsTables/results_PSS-dWT-TV.csv")

PSS_result_WT_TV <- results(ddsMat_PSS, contrast=c("strain", "WT", "TV")) # WT/TV -> higher in WT: higher log2FC
write.csv(PSS_result_WT_TV[order(PSS_result_WT_TV$padj),], file="output/DESeq2_resultsTables/results_PSS-WT-TV.csv")

TSS_result_dWT_WT <- results(ddsMat_TSS, contrast=c("strain", "dWT", "WT")) # dWT/WT -> higher in dWT: higher log2FC
write.csv(TSS_result_dWT_WT[order(TSS_result_dWT_WT$padj),], file="output/DESeq2_resultsTables/results_TSS-dWT-WT.csv")

TSS_result_dWT_TV <- results(ddsMat_TSS, contrast=c("strain", "dWT", "TV")) # dWT/TV -> higher in dWT: higher log2FC
write.csv(TSS_result_dWT_TV[order(TSS_result_dWT_TV$padj),], file="output/DESeq2_resultsTables/results_TSS-dWT-TV.csv")

TSS_result_WT_TV <- results(ddsMat_TSS, contrast=c("strain", "WT", "TV")) # WT/TV -> higher in WT: higher log2FC
write.csv(TSS_result_WT_TV[order(TSS_result_WT_TV$padj),], file="output/DESeq2_resultsTables/results_TSS-WT-TV.csv")

Annotate peak regions

indices_plus <- which(strand(PSS_nonReduced_Ranges)=="+")
indices_minus <- which(strand(PSS_nonReduced_Ranges)=="-") 
PSS_names <- c()
PSS_names[indices_plus] <- paste(seqnames(PSS_nonReduced_Ranges[indices_plus]),start(PSS_nonReduced_Ranges[indices_plus]),"plus",sep="-")
PSS_names[indices_minus] <- paste(seqnames(PSS_nonReduced_Ranges[indices_minus]),start(PSS_nonReduced_Ranges[indices_minus]),"minus",sep="-")
rm(indices_plus)
rm(indices_minus)
PSS_nonReduced_Ranges$names <- PSS_names
PSS_nonReduced_Ranges$featuresOverlap <- rep("", length(PSS_nonReduced_Ranges))
PSS_nonReduced_Ranges$TU_overlap <- rep("", length(PSS_nonReduced_Ranges))
for(i in 1:length(PSS_nonReduced_Ranges)){
  PSS_nonReduced_Ranges$featuresOverlap[i] <- paste(features[which(countOverlaps(features,PSS_nonReduced_Ranges[i])>0)]$locus_tag,collapse=",")
  PSS_nonReduced_Ranges$TU_overlap[i] <- paste(TUs[which(countOverlaps(TUs,PSS_nonReduced_Ranges[i])>0)]$index,collapse=",")
}
# create tables with annotation: which features and TUs are overlapping with PSS?
df_PSS_nonRed_Ranges <- as.data.frame(PSS_nonReduced_Ranges)
row.names(df_PSS_nonRed_Ranges) <- df_PSS_nonRed_Ranges$names

# PSS result dWT WT
PSS_result_dWT_WT_annot <- rownames_to_column(as.data.frame(PSS_result_dWT_WT[order(PSS_result_dWT_WT$padj),]))
PSS_result_dWT_WT_annot$CDS <- df_PSS_nonRed_Ranges[PSS_result_dWT_WT_annot$rowname,]$featuresOverlap
PSS_result_dWT_WT_annot$TUs <- df_PSS_nonRed_Ranges[PSS_result_dWT_WT_annot$rowname,]$TU_overlap
PSS_result_dWT_WT_annot <- PSS_result_dWT_WT_annot[,c(8,9,1,2:7)]
write_tsv(PSS_result_dWT_WT_annot, "output/DESeq2_resultsTables/results_dWT_WT_annotated.tsv")

# PSS result dWT TV
PSS_result_dWT_TV_annot <- rownames_to_column(as.data.frame(PSS_result_dWT_TV[order(PSS_result_dWT_TV$padj),]))
PSS_result_dWT_TV_annot$CDS <- df_PSS_nonRed_Ranges[PSS_result_dWT_TV_annot$rowname,]$featuresOverlap
PSS_result_dWT_TV_annot$TUs <- df_PSS_nonRed_Ranges[PSS_result_dWT_TV_annot$rowname,]$TU_overlap
PSS_result_dWT_TV_annot <- PSS_result_dWT_TV_annot[,c(8,9,1,2:7)]
write_tsv(PSS_result_dWT_TV_annot, "output/DESeq2_resultsTables/results_dWT-TV_annotated.tsv")

# PSS result WT, TV
PSS_result_WT_TV_annot <- rownames_to_column(as.data.frame(PSS_result_WT_TV[order(PSS_result_WT_TV$padj),]))
PSS_result_WT_TV_annot$CDS <- df_PSS_nonRed_Ranges[PSS_result_WT_TV_annot$rowname,]$featuresOverlap
PSS_result_WT_TV_annot$TUs <- df_PSS_nonRed_Ranges[PSS_result_WT_TV_annot$rowname,]$TU_overlap
PSS_result_WT_TV_annot <- PSS_result_WT_TV_annot[,c(8,9,1,2:7)]
write_tsv(PSS_result_WT_TV_annot, "output/DESeq2_resultsTables/results_PSS-WT-TV_annotated.tsv")

Run in respective Directory (output/DESeq2_resultsTables/) python changeAnnotation_DESeq2.py results_dWT_WT_annotated.tsv results_dWT_WT_annotated-2.tsv python changeAnnotation_DESeq2.py results_dWT-TV_annotated.tsv results_dWT-TV_annotated-2.tsv python changeAnnotation_DESeq2.py results_PSS-WT-TV_annotated.tsv results_PSS-WT-TV_annotated-2.tsv

pdf(file="output/DESeq2_Plots/ddsMat_PSS-dWT-WT_pValuePlot.pdf", width=4.5, height=4.5)
pvaluePlot(PSS_result_dWT_WT, "PSS dWT WT")
dev.off()
## png 
##   2
pdf(file="output/DESeq2_Plots/ddsMat_PSS-dWT-TV_pValuePlot.pdf", width=4.5, height=4.5)
pvaluePlot(PSS_result_dWT_TV, "PSS dWT dTV")
dev.off()
## png 
##   2
pdf(file="output/DESeq2_Plots/ddsMat_dWT-WT_pValuePlot.pdf", width=4.5, height=4.5)
pvaluePlot(TSS_result_dWT_WT, "TSS dWT WT")
dev.off()
## png 
##   2
pdf(file="output/DESeq2_Plots/ddsMat_TSS-dWT-TV_pValuePlot.pdf", width=4.5, height=4.5)
pvaluePlot(TSS_result_dWT_TV, "TSS dWT TV")
dev.off()
## png 
##   2
pvaluePlot(PSS_result_dWT_WT, "PSS dWT WT")

pvaluePlot(PSS_result_dWT_TV, "PSS dWT dTV")

pvaluePlot(TSS_result_dWT_WT, "TSS dWT WT")

pvaluePlot(TSS_result_dWT_TV, "TSS dWT TV")

Filter out PSS below filter level of results(). For these, p.adj is set to NA - hence: remove positions at which p.adj==NA

PSS_result_dWT_WT <- subset(PSS_result_dWT_WT, !is.na(PSS_result_dWT_WT$padj))
PSS_result_dWT_TV <- subset(PSS_result_dWT_TV, !is.na(PSS_result_dWT_TV$padj))
PSS_result_WT_TV <- subset(PSS_result_WT_TV, !is.na(PSS_result_WT_TV$padj))
TSS_result_dWT_WT <- subset(TSS_result_dWT_WT, !is.na(TSS_result_dWT_WT$padj))
TSS_result_dWT_TV <- subset(TSS_result_dWT_TV, !is.na(TSS_result_dWT_TV$padj))
TSS_result_WT_TV <- subset(TSS_result_WT_TV, !is.na(TSS_result_WT_TV$padj))

3.2.1 PSS dWT WT

count_up_down(PSS_result_dWT_WT, foldchange=1, padjusted=0.05)
## [1] "number of features down:  7"
## [1] "number of features up:  73"
p <- volcanoPlot_ggplot(as.data.frame(PSS_result_dWT_WT), foldchange=1, padjusted=0.05) + scale_colour_manual(values=c("DOWN"="#000000ff", "UP"="#005a96ff", "NO"="#d3d3d3ff"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p

ggsave("output/DESeq2_Plots/ddsMat_PSS-0h_VolcanoPlot.pdf", plot=p, width=15, height=12, units="cm")

p <- MAplot_ggplot(PSS_result_dWT_WT, foldchange=1.0, y_axis_label = "Log2 fold-change(rne(WT)/WT)") + scale_colour_manual(values=c("DOWN"="#000000ff", "UP"="#005a96ff", "NO"="#d3d3d3ff"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p

ggsave("output/DESeq2_Plots/ddsMat_PSS-dWT-WT_MAplot.pdf",plot=p, width=15, height=12, units="cm")

3.2.2 PSS dWT-TV

count_up_down(PSS_result_dWT_TV, foldchange=1, padjusted=0.05)
## [1] "number of features down:  178"
## [1] "number of features up:  105"
p <- volcanoPlot_ggplot(as.data.frame(PSS_result_dWT_TV), foldchange=1, padjusted=0.05) + scale_colour_manual(values=c("DOWN"="#e69f00ff", "UP"="#005a96ff", "NO"="#d3d3d3ff"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p

ggsave("output/DESeq2_Plots/ddsMat_PSS-dWT-TV_VolcanoPlot.pdf", plot=p, width=15, height=12, units="cm")

p <- MAplot_ggplot(PSS_result_dWT_TV, foldchange=1.0, y_axis_label = "Log2 fold-change(rne(WT)/rne(5p))") + scale_colour_manual(values=c("DOWN"="#e69f00ff", "UP"="#005a96ff", "NO"="#d3d3d3ff"))# + ylim(-5,+5)
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p

ggsave("output/DESeq2_Plots/ddsMat_PSS-1h_MAplot.pdf",plot=p, width=15, height=12, units="cm")

3.2.3 PSS WT-TV

count_up_down(PSS_result_WT_TV, foldchange = 1, padjusted = 0.05)
## [1] "number of features down:  239"
## [1] "number of features up:  147"
p <- MAplot_ggplot(PSS_result_WT_TV, foldchange=1.0, y_axis_label = "Log2 fold-change(WT/rne(5p))") + scale_colour_manual(values=c("DOWN"="#e69f00ff", "UP"="#000000ff", "NO"="#d3d3d3ff"))# + ylim(-5,+5)
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p

ggsave("output/DESeq2_Plots/ddsMat_PSS-WT-TV_MAplot.pdf",plot=p, width=15, height=12, units="cm")

3.2.4 TSS dWT WT

count_up_down(TSS_result_dWT_WT, foldchange=1, padjusted=0.05)
## [1] "number of features down:  77"
## [1] "number of features up:  83"
p <- volcanoPlot_ggplot(as.data.frame(TSS_result_dWT_WT), foldchange=1, padjusted=0.05) + scale_colour_manual(values=c("DOWN"="#000000ff", "UP"="#005a96ff", "NO"="#d3d3d3ff"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p

ggsave("output/DESeq2_Plots/ddsMat_TSS-0h_VolcanoPlot.pdf", plot=p, width=15, height=12, units="cm")

p <- MAplot_ggplot(TSS_result_dWT_WT, foldchange=1.0, y_axis_label = "Log2 fold-change(rne(WT)/WT)") + scale_colour_manual(values=c("DOWN"="#000000ff", "UP"="#005a96ff", "NO"="#d3d3d3ff"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p

ggsave("output/DESeq2_Plots/ddsMat_TSS-dWT-WT_MAplot.pdf",plot=p, width=15, height=12, units="cm")

3.2.5 TSS dWT-TV

count_up_down(TSS_result_dWT_TV, foldchange=1, padjusted=0.05)
## [1] "number of features down:  18"
## [1] "number of features up:  14"
p <- volcanoPlot_ggplot(as.data.frame(TSS_result_dWT_TV), foldchange=1, padjusted=0.05) + scale_colour_manual(values=c("DOWN"="#e69f00ff", "UP"="#005a96ff", "NO"="#d3d3d3ff"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p

ggsave("output/DESeq2_Plots/ddsMat_TSS-dWT-TV_VolcanoPlot.pdf", plot=p, width=15, height=12, units="cm")

p <- MAplot_ggplot(TSS_result_dWT_TV, foldchange=1.0, y_axis_label = "Log2 fold-change(rne(WT)/rne(5p))") + scale_colour_manual(values=c("DOWN"="#e69f00ff", "UP"="#005a96ff", "NO"="#d3d3d3ff"))# + ylim(-5,+5)
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p

ggsave("output/DESeq2_Plots/ddsMat_TSS-1h_MAplot.pdf",plot=p, width=15, height=12, units="cm")

3.2.6 TSS WT-TV

count_up_down(TSS_result_WT_TV, foldchange = 1, padjusted = 0.05)
## [1] "number of features down:  140"
## [1] "number of features up:  212"
p <- MAplot_ggplot(TSS_result_WT_TV, foldchange=1.0, y_axis_label = "Log2 fold-change(WT/rne(5p))") + scale_colour_manual(values=c("DOWN"="#e69f00ff", "UP"="#000000ff", "NO"="#d3d3d3ff"))# + ylim(-5,+5)
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p

ggsave("output/DESeq2_Plots/ddsMat_TSS-WT-TV_MAplot.pdf",plot=p, width=15, height=12, units="cm")

3.3 Test if 5’-P / 5’-PPP ratio changes at TSS positions

TSSPSS_at_TSSpositions <- counts(ddsMat_PSSTSS, normalize=FALSE)[names_prob_TSS,] # normalize= TRUE/FALSE: should not make difference, but decided to use FALSE since should introduce less bias

# calculate ratios
dWT1_ratio=TSSPSS_at_TSSpositions[,"dWT1_PSS"]/TSSPSS_at_TSSpositions[,"dWT1_TSS"]
dWT2_ratio=TSSPSS_at_TSSpositions[,"dWT2_PSS"]/TSSPSS_at_TSSpositions[,"dWT2_TSS"]
dWT3_ratio=TSSPSS_at_TSSpositions[,"dWT3_PSS"]/TSSPSS_at_TSSpositions[,"dWT3_TSS"]
WT1_ratio=TSSPSS_at_TSSpositions[,"WT1_PSS"]/TSSPSS_at_TSSpositions[,"WT1_TSS"]
WT2_ratio=TSSPSS_at_TSSpositions[,"WT2_PSS"]/TSSPSS_at_TSSpositions[,"WT2_TSS"]
WT3_ratio=TSSPSS_at_TSSpositions[,"WT3_PSS"]/TSSPSS_at_TSSpositions[,"WT3_TSS"]
TV1_ratio=TSSPSS_at_TSSpositions[,"TV1_PSS"]/TSSPSS_at_TSSpositions[,"TV1_TSS"]
TV2_ratio=TSSPSS_at_TSSpositions[,"TV2_PSS"]/TSSPSS_at_TSSpositions[,"TV2_TSS"]
# create df for plotting
df_TSSPSS_ratio <- data.frame(sample=c(rep("dWT1", length(names_prob_TSS)),rep("dWT2", length(names_prob_TSS)),rep("dWT3", length(names_prob_TSS)),rep("WT1", length(names_prob_TSS)),rep("WT2", length(names_prob_TSS)),rep("WT3", length(names_prob_TSS)),rep("TV1", length(names_prob_TSS)),rep("TV2", length(names_prob_TSS))), type=c(rep("rne(WT)", 3*length(names_prob_TSS)), rep("WT", 3*length(names_prob_TSS)), rep("rne(5p)", 2*length(names_prob_TSS))), TSSPSS_ratio <- c(dWT1_ratio, dWT2_ratio, dWT3_ratio, WT1_ratio, WT2_ratio, WT3_ratio, TV1_ratio, TV2_ratio))
p <- ggplot(data=df_TSSPSS_ratio, aes(x=sample, y=TSSPSS_ratio, fill=type)) + geom_violin() +
  geom_boxplot(width=0.1, color="grey", alpha=0.2, outlier.shape=NA, show.legend = FALSE) +
  scale_fill_viridis(discrete=TRUE, alpha=0.6, option="A") +
  theme_light() +
  labs(fill="", y="PSS/TSS at TSS positions", x="")
p
## Warning: Removed 8 rows containing non-finite values (stat_ydensity).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

df_TSSPSS_ratio_2 <- subset(df_TSSPSS_ratio, !is.na(df_TSSPSS_ratio$TSSPSS_ratio) & !is.infinite(df_TSSPSS_ratio$TSSPSS_ratio))
df_TSSPSS_ratio_2$TSSPSS_ratio <- df_TSSPSS_ratio_2$TSSPSS_ratio + 0.0001
p <- ggplot(data=df_TSSPSS_ratio_2, aes(x=sample, y=TSSPSS_ratio, fill=type)) + geom_violin() +
  geom_boxplot(width=0.1, color="grey", alpha=0.2, outlier.shape=NA, show.legend = FALSE) +
  scale_fill_viridis(discrete=TRUE, alpha=0.6, option="A") +
  theme_light() +
  labs(fill="", y="PSS/TSS at TSS positions", x="") + scale_y_continuous(trans="log10", limits=c(NA,10), breaks=c(0,0.01,0.1,1,10))
p

4 Exploratory data analysis

4.1 Extract Peak Positions and create GRanges objects

advanced_reduce_withBaseMeans() merges adjacent PSS positions. In a second step, peaks which are wider than 1nt are reduced to a 1nt wide position according to the baseMeans of the peaks - the most highly populated position is given. PSS_Ranges_WT are peaks which are higher in rne(WT), PSS_Ranges_TV peaks which are higher in rne(5p). Since there are so few peaks differentially populated in the comparison between rne(WT) and WT, these two sets of peaks will not be analysed more specifically.

4.1.1 For all positions

PSS_Ranges_WT <- advanced_reduce_withBaseMeans(create_GRanges_object_from_resObject(PSS_result_dWT_TV, foldchange=1, padjusted=0.05))
PSS_Ranges_TV <- advanced_reduce_withBaseMeans(create_GRanges_object_from_resObject(PSS_result_dWT_TV, foldchange=(-1), padjusted=0.05, up=FALSE))
PSS_Ranges <- c(PSS_Ranges_WT, PSS_Ranges_TV)

4.3 Analyse overlap with chromosome/plasmids

Prepare data frames for the plots and count how many PSS are overlapping with which sequence and with how many of the respective features:

# prepare data.frame for barplot
sequences <- names(syne_fasta)
updown <- c(rep("rne(WT)",length(sequences)), rep("rne(5p)",length(sequences)))
df_PSS_sequence_barplot <- data.frame(cbind(sequence=sequences, updown=updown))
df_PSS_sequence_barplot$updown <- factor(df_PSS_sequence_barplot$updown, levels=c("rne(WT)", "rne(5p)"))
df_PSS_sequence_barplot$x_labels <- NA
df_PSS_sequence_barplot[df_PSS_sequence_barplot$sequence=="BA000022.2",]$x_labels <- "Chr"
df_PSS_sequence_barplot[df_PSS_sequence_barplot$sequence=="AP004311.1",]$x_labels <- "pSYSA"
df_PSS_sequence_barplot[df_PSS_sequence_barplot$sequence=="AP004312.1",]$x_labels <- "pSYSG"
df_PSS_sequence_barplot[df_PSS_sequence_barplot$sequence=="AP004310.1",]$x_labels <- "pSYSM"
df_PSS_sequence_barplot[df_PSS_sequence_barplot$sequence=="AP006585.1",]$x_labels <- "pSYSX"
df_PSS_sequence_barplot$x_labels <- factor(df_PSS_sequence_barplot$x_labels, levels=c("Chr", "pSYSA", "pSYSG", "pSYSM", "pSYSX"))
df_PSS_sequence_barplot$geno_size <- NA
for(i in df_PSS_sequence_barplot$sequence){
  df_PSS_sequence_barplot[df_PSS_sequence_barplot$sequence==i,]$geno_size <- seqlengths(syne_fasta)[i]
}
df_PSS_sequence_barplot$geno_relative_size <- df_PSS_sequence_barplot$geno_size/sum(seqlengths(syne_fasta))*100
df_PSS_sequence_barplot$number_feat_overlap <- NA
df_PSS_sequence_barplot$number_PSS_overlap <- NA 
df_PSS_sequence_barplot$number_feat_total <- NA
df_PSS_sequence_barplot$PSS_kb <- NA

# prepare data frame for number of overlaps with certain features
df_PSS_sequence_overlap_perKb <- data.frame()

# extract number of overlaps with different sequences and features, both for up- and downregulated PSS positions
for(s in sequences){
  index_up <- which(df_PSS_sequence_barplot$sequence==s & df_PSS_sequence_barplot$updown=="rne(WT)")
  index_down <- which(df_PSS_sequence_barplot$sequence==s & df_PSS_sequence_barplot$updown=="rne(5p)") 
  
  df_PSS_sequence_barplot[index_up,"number_feat_overlap"] <- sum(countOverlaps(subset(features, seqnames(features)==s), PSS_Ranges_WT)>0) # count number of features affected
  df_PSS_sequence_barplot[index_up,"number_PSS_overlap"] <- length(subset(PSS_Ranges_WT, seqnames(PSS_Ranges_WT)==s))
  df_PSS_sequence_barplot[index_up,"PSS_kb"] <- length(subset(PSS_Ranges_WT, seqnames(PSS_Ranges_WT)==s))/(seqlengths(syne_fasta[s])/1000)
    
  df_PSS_sequence_barplot[index_down,"number_feat_overlap"] <- sum(countOverlaps(subset(features, seqnames(features)==s), PSS_Ranges_TV)>0) # count number of features affected
  df_PSS_sequence_barplot[index_down,"number_PSS_overlap"] <- length(subset(PSS_Ranges_TV, seqnames(PSS_Ranges_TV)==s))
  df_PSS_sequence_barplot[index_down,"PSS_kb"] <- length(subset(PSS_Ranges_TV, seqnames(PSS_Ranges_TV)==s))/(seqlengths(syne_fasta[s])/1000)
  
  df_PSS_sequence_barplot[df_PSS_sequence_barplot$sequence==s,"number_feat_total"] <- length(subset(features, seqnames(features)==s))
}

df_PSS_sequence_barplot$perc_feat <- (df_PSS_sequence_barplot$number_feat_overlap/df_PSS_sequence_barplot$number_feat_total)*100
df_PSS_sequence_barplot$perc_all_feat <- NA
df_PSS_sequence_barplot[df_PSS_sequence_barplot$updown=="rne(WT)",]$perc_all_feat <- df_PSS_sequence_barplot[df_PSS_sequence_barplot$updown=="rne(WT)",]$number_PSS_overlap/sum(df_PSS_sequence_barplot[df_PSS_sequence_barplot$updown=="rne(WT)",]$number_PSS_overlap)*100
df_PSS_sequence_barplot[df_PSS_sequence_barplot$updown=="rne(5p)",]$perc_all_feat <- df_PSS_sequence_barplot[df_PSS_sequence_barplot$updown=="rne(5p)",]$number_PSS_overlap/sum(df_PSS_sequence_barplot[df_PSS_sequence_barplot$updown=="rne(5p)",]$number_PSS_overlap)*100
df_PSS_sequence_barplot
##      sequence  updown x_labels geno_size geno_relative_size number_feat_overlap
## 1  AP004311.1 rne(WT)    pSYSA    103307           2.617342                   2
## 2  AP006585.1 rne(WT)    pSYSX    106004           2.685672                   0
## 3  AP004310.1 rne(WT)    pSYSM    119895           3.037609                   0
## 4  AP004312.1 rne(WT)    pSYSG     44343           1.123455                   0
## 5  BA000022.2 rne(WT)      Chr   3573470          90.535921                  59
## 6  AP004311.1 rne(5p)    pSYSA    103307           2.617342                   0
## 7  AP006585.1 rne(5p)    pSYSX    106004           2.685672                   2
## 8  AP004310.1 rne(5p)    pSYSM    119895           3.037609                   0
## 9  AP004312.1 rne(5p)    pSYSG     44343           1.123455                   0
## 10 BA000022.2 rne(5p)      Chr   3573470          90.535921                  69
##    number_PSS_overlap number_feat_total      PSS_kb perc_feat perc_all_feat
## 1                   4               111 0.038719545  1.801802     4.0404040
## 2                   0               112 0.000000000  0.000000     0.0000000
## 3                   0               134 0.000000000  0.000000     0.0000000
## 4                   0                51 0.000000000  0.000000     0.0000000
## 5                  95              5726 0.026584804  1.030388    95.9595960
## 6                   2               111 0.019359772  0.000000     1.3071895
## 7                   1               112 0.009433606  1.785714     0.6535948
## 8                   1               134 0.008340631  0.000000     0.6535948
## 9                   0                51 0.000000000  0.000000     0.0000000
## 10                149              5726 0.041696166  1.205030    97.3856209
write.csv(df_PSS_sequence_barplot, file="output/overlap_contigs_PSS.csv")
# Plot: PSS per kb
p <- ggplot(data=df_PSS_sequence_barplot, aes(x=x_labels,y=PSS_kb, fill=updown)) + geom_bar(stat="identity", position="dodge") + theme_light()+ xlab("") + ylab("PSS/kb") + scale_fill_manual(name="", values=c("#005a96ff", "#e69f00ff"), breaks=c("rne(WT)", "rne(5p)"), labels=c("rne(WT)", "rne(5p)"))
p

ggsave(filename = "output/barplots_overlapFeatures/barplot_PSSkb_chr_plasmids.pdf", plot = p, width = 14, height = 7, units = "cm")

# Plot: How many features of a certain kind are overlapped by PSS?
p <- ggplot(data=df_PSS_sequence_barplot, aes(x=x_labels,y=perc_feat, fill=updown)) + geom_bar(stat="identity", position="dodge") + theme_light()+ xlab("") + ylab("Features overlapped by PSS (%)") + scale_fill_manual(name="", values=c("#005a96ff", "#e69f00ff"), breaks=c("rne(WT)", "rne(5p)"), labels=c("rne(WT)", "rne(5p)"))
p

ggsave(filename = "output/barplots_overlapFeatures/barplot_percentageFeatures_chr_plasmids.pdf", plot = p, width = 14, height = 7, units = "cm")

# Plots: How many PSS are localized in which kind of feature?
p <- ggplot(data=df_PSS_sequence_barplot, aes(x=updown,y=number_PSS_overlap, fill=x_labels)) + geom_bar(stat="identity", position="stack") + theme_light()+ xlab("") + ylab("Percentage of all PSS (%)") + scale_fill_brewer(palette="Paired", direction=1)#scale_fill_OkabeIto(name="", order=c(2:8,1))
p

ggsave(filename = "output/barplots_overlapFeatures/barplot_absolute_chromo_plasmids_ofAllPSS.pdf", plot = p, width = 7, height = 7, units = "cm")

p <- ggplot(data=df_PSS_sequence_barplot, aes(x=updown,y=number_PSS_overlap, fill=x_labels)) + geom_bar(stat="identity", position="fill") + theme_light()+ xlab("") + ylab("Number PSS") + scale_fill_brewer(palette="Paired", direction=1)#scale_fill_OkabeIto(name="", order=c(2:8,1))
p

ggsave(filename = "output/barplots_overlapFeatures/barplot_perc_chromo_plasmids_ofAllPSS.pdf", plot = p, width = 7, height = 7, units = "cm")

For comparison, plot lengths of different parts of genome:

seqlengths_syne <- as.data.frame(seqlengths(syne_fasta))
seqlengths_syne$name <- NA
seqlengths_syne["BA000022.2",]$name <- "Chr"
seqlengths_syne["AP004311.1",]$name <- "pSYSA"
seqlengths_syne["AP004312.1",]$name <- "pSYSG"
seqlengths_syne["AP004310.1",]$name <- "pSYSM"
seqlengths_syne["AP006585.1",]$name <- "pSYSX"
seqlengths_syne$x <- "a"

p <- ggplot(data=seqlengths_syne, aes(x=x, y=seqlengths(syne_fasta), fill=name)) + geom_bar(stat="identity", position="fill") + theme_light()+ xlab("") + ylab("Percentage of total length") + scale_fill_brewer(palette="Paired", direction=1)#scale_fill_OkabeIto(name="", order=c(2:8,1))
p

ggsave(filename = "output/barplots_overlapFeatures/barplot_perc_chromo_plasmids_lengths.pdf", plot = p, width = 6, height = 7, units = "cm")

4.4 Analyse overlap with different RNA features and number of overlaps with those features

Prepare data frames for the plots and count how many PSS are overlapping with which features, how many features of a certain type are overlapped by PSS, how many PSS are located in each feature per kb.

# prepare data.frame for barplot
types <- rep(c("CDS","5UTR", "3UTR", "tRNA", "rRNA", "ncRNA", "asRNA", "CRISPR", "misc"),3)
updown <- c(rep("up",9), rep("down",9), rep("both",9))
df_PSS_features <- data.frame(cbind(type=types, updown=updown))
df_PSS_features$number_feat_overlap <- NA 
df_PSS_features$number_PSS_overlap <- NA
df_PSS_features$number_total <- NA

# prepare data frame for number of overlaps with certain features
df_PSS_features_overlap_perKb <- data.frame()

# extract number of overlaps with different feature types, both for up- and downregulated PSS positions
types <- c("CDS","5UTR", "3UTR", "tRNA", "rRNA", "ncRNA", "asRNA", "CRISPR", "misc")
for(t in types){
  t_feat <- which(features$type == t)
  
  index_up <- which(df_PSS_features$type==t & df_PSS_features$updown=="up")
  index_down <- which(df_PSS_features$type==t & df_PSS_features$updown=="down") 
  index_both <- which(df_PSS_features$type==t & df_PSS_features$updown=="both") 
  
  df_PSS_features[index_up,"number_feat_overlap"] <- sum(countOverlaps(features[t_feat], PSS_Ranges_WT)>0) # count number of features affected
  df_PSS_features[index_up,"number_PSS_overlap"] <- sum(countOverlaps(PSS_Ranges_WT, features[t_feat])>0) # count where PSS are located
  df_PSS_features[index_down,"number_feat_overlap"] <- sum(countOverlaps(features[t_feat], PSS_Ranges_TV)>0) # count number of features affected
  df_PSS_features[index_down,"number_PSS_overlap"] <- sum(countOverlaps(PSS_Ranges_TV, features[t_feat])>0) # count where PSS are located
  df_PSS_features[index_both,"number_feat_overlap"] <- sum(countOverlaps(features[t_feat], PSS_Ranges_TV)>0 | countOverlaps(features[t_feat], PSS_Ranges_WT)>0) # count number of features affected
  df_PSS_features[index_both,"number_PSS_overlap"] <- sum(countOverlaps(PSS_Ranges_TV, features[t_feat])>0) + sum(countOverlaps(PSS_Ranges_WT, features[t_feat])>0) # count where PSS are located
  
  df_PSS_features[df_PSS_features$type==t,"number_total"] <- length(features[t_feat])
  
  counts_per_kb <- countOverlaps(features[t_feat], PSS_Ranges_WT)/(width(features[t_feat])/1000)
  df_PSS_features_overlap_perKb <- rbind(df_PSS_features_overlap_perKb, data.frame(type=rep(paste(t, "-up", sep=""),length(features[t_feat])), counts_per_kb=counts_per_kb))
  counts_per_kb <- countOverlaps(features[t_feat], PSS_Ranges_TV)/(width(features[t_feat])/1000)
  df_PSS_features_overlap_perKb <- rbind(df_PSS_features_overlap_perKb, data.frame(type=rep(paste(t, "-down", sep=""),length(features[t_feat])), counts_per_kb=counts_per_kb))
}
df_PSS_features
##      type updown number_feat_overlap number_PSS_overlap number_total
## 1     CDS     up                  53                 83         3675
## 2    5UTR     up                   4                  4          979
## 3    3UTR     up                   0                  0           29
## 4    tRNA     up                   3                  3           42
## 5    rRNA     up                   0                  0            6
## 6   ncRNA     up                   0                  0          318
## 7   asRNA     up                   1                  1         1071
## 8  CRISPR     up                   0                  0            3
## 9    misc     up                   0                  0           11
## 10    CDS   down                  62                117         3675
## 11   5UTR   down                   6                  7          979
## 12   3UTR   down                   1                  2           29
## 13   tRNA   down                   1                  1           42
## 14   rRNA   down                   0                  0            6
## 15  ncRNA   down                   1                  2          318
## 16  asRNA   down                   0                  0         1071
## 17 CRISPR   down                   0                  0            3
## 18   misc   down                   0                  0           11
## 19    CDS   both                  93                200         3675
## 20   5UTR   both                  10                 11          979
## 21   3UTR   both                   1                  2           29
## 22   tRNA   both                   4                  4           42
## 23   rRNA   both                   0                  0            6
## 24  ncRNA   both                   1                  2          318
## 25  asRNA   both                   1                  1         1071
## 26 CRISPR   both                   0                  0            3
## 27   misc   both                   0                  0           11
write.csv(df_PSS_features, file="output/overlap_RNAfeatures_PSS.csv")

for TUs:

sum(countOverlaps(TUs, PSS_Ranges_WT)>0)
## [1] 63
sum(countOverlaps(TUs, PSS_Ranges_TV)>0)
## [1] 82
sum((countOverlaps(TUs, PSS_Ranges_WT)|countOverlaps(TUs, PSS_Ranges_TV))>0)
## [1] 119

4.4.1 Create bar plots

Do further processing of data frames.

df_PSS_features_barplot <- subset(df_PSS_features, df_PSS_features$updown=="up" | df_PSS_features$updown=="down")

total_overlaps_up <- sum(countOverlaps(PSS_Ranges_WT, features)>0)
total_overlaps_down <- sum(countOverlaps(PSS_Ranges_TV, features)>0)

df_PSS_features_barplot[,"number_feat_overlap"] <- as.integer(df_PSS_features_barplot[,"number_feat_overlap"])/as.integer(df_PSS_features_barplot[,"number_total"])*100

# create column  with absolute numbers
df_PSS_features_barplot$number_PSS_overlap_absolute <- NA
df_PSS_features_barplot[df_PSS_features_barplot$updown=="up",]$number_PSS_overlap_absolute <- as.integer(df_PSS_features_barplot[df_PSS_features_barplot$updown=="up","number_PSS_overlap"])
df_PSS_features_barplot[df_PSS_features_barplot$updown=="down",]$number_PSS_overlap_absolute <- as.integer(df_PSS_features_barplot[df_PSS_features_barplot$updown=="down","number_PSS_overlap"])

# calculate percentages
df_PSS_features_barplot[df_PSS_features_barplot$updown=="up","number_PSS_overlap"] <- as.integer(df_PSS_features_barplot[df_PSS_features_barplot$updown=="up","number_PSS_overlap"])/total_overlaps_up *100
df_PSS_features_barplot[df_PSS_features_barplot$updown=="down","number_PSS_overlap"] <- as.integer(df_PSS_features_barplot[df_PSS_features_barplot$updown=="down","number_PSS_overlap"])/total_overlaps_down *100

# rename types (5'UTR instead of 5UTR, 3'UTR instead of 3UTR)
df_PSS_features_barplot[,"type"] <- rep(c("CDS","5'UTR", "3'UTR", "tRNA", "rRNA", "ncRNA", "asRNA", "CRISPR", "misc"),2)

# reduce plot to information I actually care about
df_PSS_features_barplot <- df_PSS_features_barplot[which(!df_PSS_features_barplot$type=="rRNA"),] # rRNAs cannot be counted here, because multireads were excluded from analysis
df_PSS_features_barplot <- df_PSS_features_barplot[which(!df_PSS_features_barplot$type=="misc"),] # misc are actually only genes which are split by plasmid / chromosome beginning and two features (NC-2306445, NC-3547510) where I never figured out what they are. One PSS overlaps with sll8049-1. So I think one can savely ignore those

Do actual plotting.

# cast type, updown as factors to influence order in which they show up in the plots
df_PSS_features_barplot$type <- factor(df_PSS_features_barplot$type, levels=c("tRNA", "asRNA", "ncRNA", "CRISPR", "3'UTR", "5'UTR", "CDS"))
df_PSS_features_barplot$updown <- c(rep("rne(WT)", 7), rep("rne(5p)",7))
df_PSS_features_barplot$updown <- factor(df_PSS_features_barplot$updown, levels=c("rne(WT)", "rne(5p)"))

# Plot: How many features of a certain kind are overlapped by PSS?
p <- ggplot(data=df_PSS_features_barplot, aes(x=type,y=number_feat_overlap, fill=updown)) + geom_bar(stat="identity", position="dodge") + theme_light()+ xlab("") + ylab("Percentage PSS (%)") + scale_fill_manual(name="", values=c("#005a96ff", "#e69f00ff"), breaks=c("rne(WT)", "rne(5p)"), labels=c("rne(WT)", "rne(5p)"))
p

ggsave(filename = "output/barplots_overlapFeatures/barplot_percentage_of_allThisFeature.pdf", plot = p, width = 14, height = 7, units = "cm")

# Plots: How many PSS are localized in which kind of feature?
p <- ggplot(data=df_PSS_features_barplot, aes(x=updown,y=number_PSS_overlap, fill=type)) + geom_bar(stat="identity", position="stack") + theme_light()+ xlab("") + ylab("Percentage of all PSS (%)") + scale_fill_brewer(palette="Paired", direction=1)#scale_fill_OkabeIto(name="", order=c(2:8,1))
p

ggsave(filename = "output/barplots_overlapFeatures/barplot_percentage_ofAllPSS.pdf", plot = p, width = 7, height = 7, units = "cm")

p <- ggplot(data=df_PSS_features_barplot, aes(x=updown,y=number_PSS_overlap_absolute, fill=type)) + geom_bar(stat="identity", position="stack") + theme_light()+ xlab("") + ylab("Number PSS") + scale_fill_brewer(palette="Paired", direction=1)#scale_fill_OkabeIto(name="", order=c(2:8,1))
p

ggsave(filename = "output/barplots_overlapFeatures/barplot_absolute_ofAllPSS.pdf", plot = p, width = 7, height = 7, units = "cm")

4.4.2 Plots of number PSS per feature

Exclude rRNA, tRNA, misc from plots since not much information in those.

df_PSS_features_overlap_perKb <- df_PSS_features_overlap_perKb[which(!df_PSS_features_overlap_perKb$type=="rRNA-up"),]
df_PSS_features_overlap_perKb <- df_PSS_features_overlap_perKb[which(!df_PSS_features_overlap_perKb$type=="rRNA-down"),]
df_PSS_features_overlap_perKb <- df_PSS_features_overlap_perKb[which(!df_PSS_features_overlap_perKb$type=="tRNA-up"),]
df_PSS_features_overlap_perKb <- df_PSS_features_overlap_perKb[which(!df_PSS_features_overlap_perKb$type=="tRNA-down"),]
df_PSS_features_overlap_perKb <- df_PSS_features_overlap_perKb[which(!df_PSS_features_overlap_perKb$type=="misc-up"),]
df_PSS_features_overlap_perKb <- df_PSS_features_overlap_perKb[which(!df_PSS_features_overlap_perKb$type=="misc-down"),]

Violin plot for a first overview.

p <- ggplot(data=df_PSS_features_overlap_perKb, aes(x=type, y=counts_per_kb, fill=type)) + geom_violin() +
  geom_boxplot(width=0.1, color="grey", alpha=0.2, outlier.shape=NA, show.legend = FALSE) +
  scale_fill_viridis(discrete=TRUE, alpha=0.6, option="A") +
  theme_light() +
  labs(fill="", y="Number of PSS per feature per kb", x="")
p

Plot with log-transformation + statistics for all features (add pseudocount of 0.0001 to avoid 0-problem of log-transformation).

df_PSS_features_overlap_perKb$type <- as.factor(df_PSS_features_overlap_perKb$type)
magma_colors <- magma(length(levels(df_PSS_features_overlap_perKb$type))/2, alpha = 0.6, begin = 0, end = 1, direction = 1)[c(1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9)]

p <- ggplot(data=df_PSS_features_overlap_perKb, aes(x=type, y=counts_per_kb +0.0001, fill=type)) + 
  geom_boxplot(outlier.shape=NA) + geom_jitter(color="black", size=0.05, alpha=0.9, show.legend=FALSE)+ 
  scale_fill_manual(values=magma_colors)+
  theme_light()+ theme(legend.position = "none") +
  labs(fill="", y="Number of PSS per kb", x="") + scale_y_continuous(trans="log10", limits=c(NA,100), breaks=c(0,2,4,8,16,32,64)) + scale_x_discrete(breaks=c("3UTR-down", "3UTR-up", "5UTR-down", "5UTR-up", "asRNA-down", "asRNA-up", "CDS-down", "CDS-up", "CRISPR-down", "CRISPR-up", "ncRNA-down","ncRNA-up"), labels=c("Ts\n3'UTR", "WT\n3'UTR", "Ts\n5'UTR", "WT\n5'UTR", "Ts\nasRNA", "WT\nasRNA", "Ts\nCDS", "WT\nCDS", "Ts\nCRISPR", "WT\nCRISPR", "Ts\nncRNA", "WT\nncRNA"))
#        labels=rep(c("Ts", "WT"), 6))
p 

Plot with log-transformation, without pseudocounts: Boxplots show statistics for features which are cleaved at all, not for all features irrespective if cleaved or not.

df_PSS_features_overlap_perKb$type <- as.factor(df_PSS_features_overlap_perKb$type)
magma_colors <- magma(length(levels(df_PSS_features_overlap_perKb$type))/2, alpha = 0.6, begin = 0, end = 1, direction = 1)[c(1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9)]

p <- ggplot(data=df_PSS_features_overlap_perKb, aes(x=type, y=counts_per_kb, fill=type)) + 
  geom_boxplot(outlier.shape=NA) + geom_jitter(color="black", size=0.05, alpha=0.9, show.legend=FALSE)+ 
  scale_fill_manual(values=magma_colors)+
  theme_light()+ theme(legend.position = "none") +
  labs(fill="", y="Number of PSS per kb", x="") + scale_y_continuous(trans="log10") + scale_x_discrete(breaks=c("3UTR-down", "3UTR-up", "5UTR-down", "5UTR-up", "asRNA-down", "asRNA-up", "CDS-down", "CDS-up", "CRISPR-down", "CRISPR-up", "ncRNA-down","ncRNA-up"), labels=c("Ts\n3'UTR", "WT\n3'UTR", "Ts\n5'UTR", "WT\n5'UTR", "Ts\nasRNA", "WT\nasRNA", "Ts\nCDS", "WT\nCDS", "Ts\nCRISPR", "WT\nCRISPR", "Ts\nncRNA", "WT\nncRNA"))
#        labels=rep(c("Ts", "WT"), 6))
p 
## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 12022 rows containing non-finite values (stat_boxplot).

ggsave("output/barplots_overlapFeatures/violin_NumberPSS_perKb.pdf", plot=p, width = 14, height = 7, units="cm")
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 12022 rows containing non-finite values (stat_boxplot).

4.4.3 Further analysis of statistics of number of PSS overlapping certain feature

Statistics for different feature types:

summary(df_PSS_features_overlap_perKb[which(grepl("CRISPR-up", df_PSS_features_overlap_perKb$type, fixed=TRUE)),]$counts_per_kb)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
summary(df_PSS_features_overlap_perKb[which(grepl("CRISPR-down", df_PSS_features_overlap_perKb$type, fixed=TRUE)),]$counts_per_kb)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
summary(df_PSS_features_overlap_perKb[which(grepl("CDS-up", df_PSS_features_overlap_perKb$type, fixed=TRUE)),]$counts_per_kb)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.03747  0.00000 12.34568
summary(df_PSS_features_overlap_perKb[which(grepl("CDS-down", df_PSS_features_overlap_perKb$type, fixed=TRUE)),]$counts_per_kb)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.06156  0.00000 19.60784
summary(df_PSS_features_overlap_perKb[which(grepl("asRNA-up", df_PSS_features_overlap_perKb$type, fixed=TRUE)),]$counts_per_kb)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.000000 0.000000 0.005335 0.000000 5.714286
summary(df_PSS_features_overlap_perKb[which(grepl("asRNA-down", df_PSS_features_overlap_perKb$type, fixed=TRUE)),]$counts_per_kb)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
summary(df_PSS_features_overlap_perKb[which(grepl("ncRNA-up", df_PSS_features_overlap_perKb$type, fixed=TRUE)),]$counts_per_kb)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
summary(df_PSS_features_overlap_perKb[which(grepl("ncRNA-down", df_PSS_features_overlap_perKb$type, fixed=TRUE)),]$counts_per_kb)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.07487  0.00000 23.80952
summary(df_PSS_features_overlap_perKb[which(grepl("5UTR-up", df_PSS_features_overlap_perKb$type, fixed=TRUE)),]$counts_per_kb)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.07112  0.00000 32.25806
summary(df_PSS_features_overlap_perKb[which(grepl("5UTR-down", df_PSS_features_overlap_perKb$type, fixed=TRUE)),]$counts_per_kb)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1611  0.0000 43.4783
summary(df_PSS_features_overlap_perKb[which(grepl("3UTR-up", df_PSS_features_overlap_perKb$type, fixed=TRUE)),]$counts_per_kb)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
summary(df_PSS_features_overlap_perKb[which(grepl("3UTR-down", df_PSS_features_overlap_perKb$type, fixed=TRUE)),]$counts_per_kb)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.3789  0.0000 10.9890
summary(df_PSS_features_overlap_perKb[which(grepl("up", df_PSS_features_overlap_perKb$type, fixed=TRUE)),]$counts_per_kb)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.03507  0.00000 32.25806
summary(df_PSS_features_overlap_perKb[which(grepl("down", df_PSS_features_overlap_perKb$type, fixed=TRUE)),]$counts_per_kb)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.06893  0.00000 43.47826

4.5 GO and KEGG enrichment in set of detected PSS

Control if, in general, PSS are rather detected in certain set of genes. GO annotation only exists for CDS, not for other features: Therefore, only use CDS for functional enrichment.

Detectable PSS are enriched in genes associated with photosynthesis or ribosomes. I assume both sets are, compared to other sets of genes, highly transcribed and that’s the reason why PSS can be detected in those genes. Probably, there are also stable processed RNA fragments created from other sets of transcripts, but they might be too lowly transcribed to be captured by tagRNA-Seq.

CDS_features <- features[which(features$type=="CDS")]
locus_tags_all_PSS <- CDS_features[which(countOverlaps(CDS_features, PSS_advReduce_Ranges)>0)]$locus_tag
# compared to all CDS
kegg_PSS_all_enrich <- kegg_functional_enrichment_locusList(locus_tags_all_PSS, CDS_features$locus_tag, write=TRUE, path="output/enrichment/KEGG_allPSS.csv")
## Reading KEGG annotation online:
## 
## Reading KEGG annotation online:
##                ID                       Description GeneRatio BgRatio
## syn00195 syn00195                    Photosynthesis    37/330  63/953
## syn03010 syn03010                          Ribosome    32/330  54/953
## syn00196 syn00196 Photosynthesis - antenna proteins    12/330  15/953
## syn00190 syn00190         Oxidative phosphorylation    28/330  49/953
##                pvalue    p.adjust      qvalue
## syn00195 4.488607e-05 0.002782937 0.002504170
## syn03010 1.220216e-04 0.003782668 0.003403759
## syn00196 3.866458e-04 0.007990680 0.007190255
## syn00190 7.756333e-04 0.012022316 0.010818044
go_PSS_all_enrich <- go_functional_enrichment_locusList(locus_tags_all_PSS, CDS_features$locus_tag, write=TRUE, path="output/enrichment/GO_allPSS.csv")
##                    ID                        Description GeneRatio BgRatio
## GO:0042651 GO:0042651                 thylakoid membrane    58/724 96/2593
## GO:0018298 GO:0018298        protein-chromophore linkage    22/724 31/2593
## GO:0003735 GO:0003735 structural constituent of ribosome    33/724 56/2593
## GO:0015979 GO:0015979                     photosynthesis    45/724 86/2593
## GO:0048038 GO:0048038                    quinone binding    17/724 22/2593
## GO:0006412 GO:0006412                        translation    45/724 90/2593
##                  pvalue     p.adjust       qvalue
## GO:0042651 1.179467e-11 2.040478e-09 1.614008e-09
## GO:0018298 6.803585e-07 4.424105e-05 3.499444e-05
## GO:0003735 8.886063e-07 4.424105e-05 3.499444e-05
## GO:0015979 1.022914e-06 4.424105e-05 3.499444e-05
## GO:0048038 1.957421e-06 6.772676e-05 5.357152e-05
## GO:0006412 5.197638e-06 1.367076e-04 1.081350e-04
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     geneID
## GO:0042651 slr1311/sll0223/sll1327/sll1326/sll1324/sll1323/ssl2615/sll1322/slr1291/smr0005/slr1604/ssr3451/slr1034/slr1390/sll1814/ssl3335/slr1834/slr1835/sll1463/sll1281/ssl2501/sml0007/sll1697/sll0851/sll0849/slr0261/slr1513/slr1329/slr1330/sml0008/ssr1386/slr0823/sll1262/sll1867/slr1279/slr1280/slr1281/ssr2831/slr1645/sll0258/slr1949/ssl0563/slr0331/sml0001/smr0001/slr0342/slr0343/sll0199/slr0228/sll0689/slr0906/ssl1690/slr0083/slr0927/sll0522/sll0520/sll0519/smr0004
## GO:0018298                                                                                                                                                                                                                                                                                                 slr1311/sll0928/slr0687/sll1578/sll1577/sll1124/slr1834/slr1835/slr0854/sll0851/sll0849/slr2067/slr1986/slr1963/slr1969/sll1867/slr0335/slr0473/slr0906/sll0041/slr0927/slr1459
## GO:0003735                                                                                                                                                                                                         ssr1604/ssl1784/ssr1736/sll1824/ssl3445/sll1821/sll1819/sll1816/sml0006/sll1813/sll1808/sll1803/sll1802/ssl2233/sll1746/sll1744/sll1740/slr1356/sll1767/sll1260/sll1244/slr1984/sll1101/sll1097/sll1096/ssr2799/sll0767/ssl1426/ssl0601/slr0469/slr0628/slr0923/ssr1398
## GO:0015979                                                                                                         sll1214/sll1398/slr0737/sll1032/sll1031/sll1029/sll1028/sll1194/sll1184/sll0928/smr0005/slr1055/sll1638/ssr3451/slr2051/ssl3093/sll1580/sll1578/sll1577/slr1834/slr1835/sll1281/sml0007/slr2067/slr1986/ssr3383/sll1418/sml0008/sll0819/slr0823/slr1347/sll1091/ssr2831/sll0427/sll0258/slr0335/sml0001/slr0772/smr0001/slr0342/slr0436/slr0525/slr1459/sll1471/smr0004
## GO:0048038                                                                                                                                                                                                                                                                                                                                         sll0223/slr2007/sll1732/slr0261/ssr1386/sll1262/slr1279/slr1280/slr1281/slr0331/ssl1690/slr0565/slr0844/sll0522/sll0521/sll0520/sll0519
## GO:0006412                                                                                                         ssr1604/ssl1784/ssr1736/sll1824/ssl3445/sll1821/sll1819/sll1816/sml0006/sll1813/sll1808/sll1803/sll1802/ssl2233/sll1746/sll1744/sll1740/slr1550/sll1362/slr1356/slr0877/sll1767/slr1720/sll1425/sll1260/sll1244/slr1984/sll1435/sll1101/sll1097/sll1096/ssr2799/sll1553/sll0145/sll0767/ssl1426/ssl0601/slr0469/slr0628/slr0638/slr0923/sll0546/ssr1398/slr1560/slr0604
##            Count
## GO:0042651    58
## GO:0018298    22
## GO:0003735    33
## GO:0015979    45
## GO:0048038    17
## GO:0006412    45
p <- dotplot(kegg_PSS_all_enrich)
## wrong orderBy parameter; set to default `orderBy = "x"`
p

ggsave("output/enrichment/KEGG_dotplot_allPSS.pdf", plot=p, width=20, height=14, units="cm")
p <- dotplot(go_PSS_all_enrich, showCategory=26)
## wrong orderBy parameter; set to default `orderBy = "x"`
p

ggsave("output/enrichment/GO_dotplot_allPSS.pdf", plot=p, width=20, height=14, units="cm")

Compared to the positions where PSS were generally detected, no terms are enriched in set of PSS upregulated (higher in rne(WT)).

# enrichment in rne(WT) PSS
locus_tags_up <- CDS_features[which(countOverlaps(CDS_features, PSS_Ranges_WT)>0)]$locus_tag
# compared to all CDS
kegg_PSS_up_enrich <- kegg_functional_enrichment_locusList(locus_tags_up, CDS_features$locus_tag, write=TRUE, path="output/enrichment/KEGG_PSS_rneWT_universeAllCDS.csv")
##                ID                       Description GeneRatio BgRatio
## syn00196 syn00196 Photosynthesis - antenna proteins      4/21  15/953
##                pvalue    p.adjust      qvalue
## syn00196 0.0002040451 0.003060677 0.002577412
go_PSS_up_enrich <- go_functional_enrichment_locusList(locus_tags_up, CDS_features$locus_tag, write=TRUE, path="output/enrichment/GO_PSS_rneWT_universeAllCDS.csv")
##                    ID                 Description GeneRatio BgRatio
## GO:0030089 GO:0030089               phycobilisome      5/41 24/2593
## GO:0018298 GO:0018298 protein-chromophore linkage      5/41 31/2593
## GO:0015979 GO:0015979              photosynthesis      7/41 86/2593
##                  pvalue    p.adjust      qvalue
## GO:0030089 2.622924e-05 0.001914734 0.001822242
## GO:0018298 9.663725e-05 0.003527260 0.003356873
## GO:0015979 3.105628e-04 0.007557028 0.007191981
##                                                             geneID Count
## GO:0030089                 slr2051/sll1578/slr1986/slr1963/slr0335     5
## GO:0018298                 sll1578/slr1834/slr1986/slr1963/slr0335     5
## GO:0015979 sll1214/sll1184/slr2051/sll1578/slr1834/slr1986/slr0335     7
# compared to universe: Where are PSS generally localized / what was detectable
kegg_PSS_up_enrich_2 <- kegg_functional_enrichment_locusList(list_of_genes=locus_tags_up, universe_tags=locus_tags_all_PSS, write=TRUE, path="output/enrichment/KEGG_PSS_rneWT_universeDetectedCDS.csv")
## [1] ID          Description GeneRatio   BgRatio     pvalue      p.adjust   
## [7] qvalue     
## <0 rows> (or 0-length row.names)
go_PSS_up_enrich_2 <- go_functional_enrichment_locusList(locus_tags_up, universe_tags=locus_tags_all_PSS, write=TRUE, path="output/enrichment/GO_PSS_rneWT_universeDetectedCDS.csv")
##                    ID   Description GeneRatio BgRatio       pvalue   p.adjust
## GO:0030089 GO:0030089 phycobilisome      5/41  14/724 0.0006253973 0.02689209
##                qvalue                                  geneID Count
## GO:0030089 0.02633252 slr2051/sll1578/slr1986/slr1963/slr0335     5

When using all positions of PSS as universe, GO term phycobilisome is enriched in the PSS ranges rne(WT) data set.

# enrichment in rne(TV) PSS
locus_tags_down <- CDS_features[which(countOverlaps(CDS_features, PSS_Ranges_TV)>0)]$locus_tag
# compared to all CDS
kegg_PSS_down_enrich <- kegg_functional_enrichment_locusList(locus_tags_down, CDS_features$locus_tag, write=TRUE, path="output/enrichment/KEGG_PSS_TV_universeAllCDS.csv")
##                ID    Description GeneRatio BgRatio       pvalue    p.adjust
## syn00195 syn00195 Photosynthesis      8/29  63/953 0.0003317084 0.007961001
##               qvalue
## syn00195 0.007681667
go_PSS_down_enrich <- go_functional_enrichment_locusList(locus_tags_down, CDS_features$locus_tag, write=TRUE, path="output/enrichment/GO_PSS_TV_universeAllCDS.csv")
##                    ID                 Description GeneRatio BgRatio
## GO:0042651 GO:0042651          thylakoid membrane     12/52 96/2593
## GO:0015979 GO:0015979              photosynthesis     11/52 86/2593
## GO:0018298 GO:0018298 protein-chromophore linkage      7/52 31/2593
## GO:0016168 GO:0016168         chlorophyll binding      4/52 10/2593
## GO:0030089 GO:0030089               phycobilisome      4/52 24/2593
##                  pvalue     p.adjust       qvalue
## GO:0042651 2.016019e-07 1.632976e-05 1.443046e-05
## GO:0015979 5.573214e-07 2.257152e-05 1.994624e-05
## GO:0018298 1.568669e-06 4.235406e-05 3.742789e-05
## GO:0016168 2.765964e-05 5.601076e-04 4.949619e-04
## GO:0030089 1.136353e-03 1.840891e-02 1.626779e-02
##                                                                                                     geneID
## GO:0042651 slr1311/sll1814/slr1834/slr1835/sll0851/slr1513/slr1330/sml0008/slr0823/slr1281/ssr2831/sll0199
## GO:0015979         sll1214/sll1184/slr2051/sll1578/slr1834/slr1835/slr1986/sml0008/slr0823/sll1091/ssr2831
## GO:0018298                                         slr1311/sll1578/slr1834/slr1835/sll0851/slr1986/slr1963
## GO:0016168                                                                 slr1311/slr1834/slr1835/sll0851
## GO:0030089                                                                 slr2051/sll1578/slr1986/slr1963
##            Count
## GO:0042651    12
## GO:0015979    11
## GO:0018298     7
## GO:0016168     4
## GO:0030089     4
# compared to universe: Where are PSS generally localized / what was detectable
kegg_PSS_down_enrich_2 <- kegg_functional_enrichment_locusList(locus_tags_down, locus_tags_all_PSS, write=TRUE, path="output/enrichment/KEGG_PSS_TV_universeDetectedCDS.csv")
## [1] ID          Description GeneRatio   BgRatio     pvalue      p.adjust   
## [7] qvalue     
## <0 rows> (or 0-length row.names)
go_PSS_down_enrich_2 <- go_functional_enrichment_locusList(locus_tags_down, locus_tags_all_PSS, write=TRUE, path="output/enrichment/GO_PSS_TV_universeDetectedCDS.csv")
##                    ID                 Description GeneRatio BgRatio
## GO:0015979 GO:0015979              photosynthesis     11/52  45/724
## GO:0042651 GO:0042651          thylakoid membrane     12/52  58/724
## GO:0018298 GO:0018298 protein-chromophore linkage      7/52  22/724
##                  pvalue    p.adjust      qvalue
## GO:0015979 0.0001423478 0.008256170 0.007941506
## GO:0042651 0.0003811497 0.009393589 0.009035575
## GO:0018298 0.0004858753 0.009393589 0.009035575
##                                                                                                     geneID
## GO:0015979         sll1214/sll1184/slr2051/sll1578/slr1834/slr1835/slr1986/sml0008/slr0823/sll1091/ssr2831
## GO:0042651 slr1311/sll1814/slr1834/slr1835/sll0851/slr1513/slr1330/sml0008/slr0823/slr1281/ssr2831/sll0199
## GO:0018298                                         slr1311/sll1578/slr1834/slr1835/sll0851/slr1986/slr1963
##            Count
## GO:0015979    11
## GO:0042651    12
## GO:0018298     7
p <- dotplot(go_PSS_down_enrich_2)
## wrong orderBy parameter; set to default `orderBy = "x"`
p

ggsave("output/enrichment/GO_dotplot_PSS_rneTV.pdf", plot=p, width=20, height=14, units="cm")

4.6 Distance of PSS to Start / Stop position

count_overlaps_xNt_upDownstream <- function(Ranges_object, Ranges_object_2, sequence_object, start_stop="start", len=25){
  # function returns df with total number overlapping between Ranges_object and Ranges_object_2, iterating from start or stop codon (choose by setting start_stop to "start" or "stop") of Ranges_object +/- nucleotide positions. 
  
  # prepare df to return
  df_overlaps <- data.frame(rbind(rep("NA", length((-len):(+len)))))
  names(df_overlaps) <- (-len):(+len)
    
  # extract lengths of sequences (needed for catching cases in which start_end -len or +len exceeds chromosome/plasmid length) -> if this is the case: since circular DNA: start from other end with counting again (see below)
  lengths <- c()
  for(i in 1:length(sequence_object)){
    lengths[i] <- width(sequence_object[i])
  }
  names(lengths) <- names(sequence_object)
  
  # extract info about what to extract (depending on strength: upstream corresponds to either + or -)
  strands <- data.frame(Ranges_object)[,5]
  plus_strand <- which(strand(Ranges_object)=="+")
  minus_strand <- which(strand(Ranges_object)=="-")
  seqs <- as.character(data.frame(Ranges_object)[,1])
  
  for(d in (-len):(+len)){
    positions <- c()
  
    # get positions of start_end on plus and minus strand
    if(start_stop=="start"){
      positions[plus_strand] <- start(Ranges_object[plus_strand])-(-d) # e.g. - (- -25) -> corresponds to -25
      positions[minus_strand] <- end(Ranges_object[minus_strand])+(-d) # e.g. + (- -25) -> corresponds to +25
    }else{
      positions[plus_strand] <- end(Ranges_object[plus_strand])-(-d) # e.g. - (- -25) -> corresponds to -25
      positions[minus_strand] <- start(Ranges_object[minus_strand])+(-d) # e.g. + (- -25) -> corresponds to +25
    }
    # positions < 0
    indices <- which(positions < 0)
    positions[indices] <- lengths[seqs[indices]]-(-positions[indices])
    
    # positions at which positions > length of respective plasmid / chromosome
    indices <- which(positions>lengths[seqs])
    positions[indices] <- positions[indices]-lengths[seqs[indices]]
    
    # create GRanges object
    temp_GRanges <- GRanges(seqnames=seqs, ranges=IRanges(start = positions, end=positions), strand = strands)
    
    # count overlaps    
    overlap_vect <- countOverlaps(temp_GRanges, Ranges_object_2)
    df_overlaps[,as.character(d)] <- sum(overlap_vect)
  }
  
  return(df_overlaps)
}

4.6.1 For PSS which are upregulated in rne(WT)

overlaps_PSS_up_start_CDS <- count_overlaps_xNt_upDownstream(features[which(features$type=="CDS")], PSS_Ranges_WT, syne_fasta, len = 300, start_stop = "start")
overlaps_PSS_up_stop_CDS <- count_overlaps_xNt_upDownstream(features[which(features$type=="CDS")], PSS_Ranges_WT, syne_fasta, len = 300, start_stop = "stop")

overlaps_PSS_up_start_CDS_ggplot <- data.frame(t(overlaps_PSS_up_start_CDS))
overlaps_PSS_up_start_CDS_ggplot$nt_pos <- as.numeric(row.names(overlaps_PSS_up_start_CDS_ggplot))
names(overlaps_PSS_up_start_CDS_ggplot)[1] <- "count"

overlaps_PSS_up_stop_CDS_ggplot <- data.frame(t(overlaps_PSS_up_stop_CDS))
overlaps_PSS_up_stop_CDS_ggplot$nt_pos <- as.numeric(row.names(overlaps_PSS_up_stop_CDS_ggplot))
names(overlaps_PSS_up_stop_CDS_ggplot)[1] <- "count"
p <- ggplot(overlaps_PSS_up_start_CDS_ggplot, aes(x=nt_pos, y=count)) + geom_line(size=0.5, color="#005a96ff") + theme_light() + labs(x="Distance relative to start codon (nt)", y="Number of cleavage sites")+ geom_vline(aes(xintercept=0), linetype=2, color="darkgray")# + ylim(0,8)
p

ggsave("output/distance_startStop/PSS_rneWT_count_relativeStartCodon.pdf", plot=p, width=10, height=7, units="cm")
p <- ggplot(overlaps_PSS_up_stop_CDS_ggplot, aes(x=nt_pos, y=count)) + geom_line(size=0.5, color="#005a96ff") + theme_light() + labs(x="Distance relative to stop codon (nt)", y="Number of cleavage sites") + geom_vline(aes(xintercept=0), linetype=2, color="darkgray")# + ylim(0,8)
p

ggsave("output/distance_startStop/PSS_rneWT_count_relativeStopCodon.pdf", plot=p, width=10, height=7, units="cm")

4.6.2 For PSS which are upregulated in rne(5p)

overlaps_PSS_down_start_CDS <- count_overlaps_xNt_upDownstream(features[which(features$type=="CDS")], PSS_Ranges_TV, syne_fasta, len = 300, start_stop = "start")
overlaps_PSS_down_stop_CDS <- count_overlaps_xNt_upDownstream(features[which(features$type=="CDS")], PSS_Ranges_TV, syne_fasta, len = 300, start_stop = "stop")

overlaps_PSS_down_start_CDS_ggplot <- data.frame(t(overlaps_PSS_down_start_CDS))
overlaps_PSS_down_start_CDS_ggplot$nt_pos <- as.numeric(row.names(overlaps_PSS_down_start_CDS_ggplot))
names(overlaps_PSS_down_start_CDS_ggplot)[1] <- "count"

overlaps_PSS_down_stop_CDS_ggplot <- data.frame(t(overlaps_PSS_down_stop_CDS))
overlaps_PSS_down_stop_CDS_ggplot$nt_pos <- as.numeric(row.names(overlaps_PSS_down_stop_CDS_ggplot))
names(overlaps_PSS_down_stop_CDS_ggplot)[1] <- "count"
p <- ggplot(overlaps_PSS_down_start_CDS_ggplot, aes(x=nt_pos, y=count)) + geom_line(size=0.5, color="#e69f00ff") + theme_light() + labs(x="Distance relative to start codon (nt)", y="Number of cleavage sites")+ geom_vline(aes(xintercept=0), linetype=2, color="darkgray")# + ylim(0,8)
p

ggsave("output/distance_startStop/PSS_TV_count_relativeStartCodon.pdf", plot=p, width=10, height=7, units="cm")
p <- ggplot(overlaps_PSS_down_stop_CDS_ggplot, aes(x=nt_pos, y=count)) + geom_line(size=0.5, color="#e69f00ff") + theme_light() + labs(x="Distance relative to stop codon (nt)", y="Number of cleavage sites")+ geom_vline(aes(xintercept=0), linetype=2, color="darkgray")# + ylim(0,8)
p

ggsave("output/distance_startStop/PSS_TV_count_relativeStopCodon.pdf", plot=p, width=10, height=7, units="cm")

4.7 Perform RNA Fold with sliding window approach

To investigate secondary structures around processing sites: Extract 150nt upstream / downstream of processing sites and calculate minimum free energy (\(\Delta\)G). Do this also for random peaks, start and end positions of TUs, start and end positions of CDS. Use sliding window approach to reduce problem of transcript boundaries.

4.7.1 Processing Sites

# export sequences from peaks + / - 150nt
peaks_WT_150 <- extract_RNA_sequences(PSS_Ranges_WT, syne_fasta, 150)
peaks_TV_150 <- extract_RNA_sequences(PSS_Ranges_TV, syne_fasta, 150)

writeXStringSet(peaks_WT_150, "output/RNAfold/peaks_WT_150.fasta", append=FALSE,
                compress=FALSE, compression_level=NA, format="fasta")
writeXStringSet(peaks_TV_150, "output/RNAfold/peaks_TV_150.fasta", append=FALSE,
                compress=FALSE, compression_level=NA, format="fasta")

run from command line (!takes a long time!): python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/peaks_WT_150.fasta RNAfold/mfe_files/mfe_peaks_WT_150.tsv 50 python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/peaks_TV_150.fasta RNAfold/mfe_files/mfe_peaks_TV_150.tsv 50

mfe_PSS_up <- read.delim("output/RNAfold/mfe_files/mfe_peaks_WT_150.tsv", header=TRUE, row.names=1)
mfe_PSS_up_control <- read.delim("output/RNAfold/mfe_files/mfe_peaks_WT_150_control.tsv", header=TRUE, row.names=1)
mfe_PSS_down <- read.delim("output/RNAfold/mfe_files/mfe_peaks_TV_150.tsv", header=TRUE, row.names=1)
mfe_PSS_down_control <- read.delim("output/RNAfold/mfe_files/mfe_peaks_TV_150_control.tsv", header=TRUE, row.names=1)

Original sequences represented nucleotide positions before (-150:-1) and after (+1:+150) cleavage position. With the floating window approach etc., this translates to position “150.5” (index 126).

df_mfe_PSS_quartiles <- data.frame(cbind(ntpos=c(rep(c(-125:125),2)), mfe_distrib=c(apply(mfe_PSS_up, 1, median), apply(mfe_PSS_down, 1, median),apply(mfe_PSS_up, 1, quantile,probs=0.25), apply(mfe_PSS_down, 1, quantile,probs=0.25),apply(mfe_PSS_up, 1,  quantile,probs=0.75), apply(mfe_PSS_down, 1,  quantile,probs=0.75))))
df_mfe_PSS_quartiles$updown <- rep(c(rep("up",251), rep("down", 251)),3)
df_mfe_PSS_quartiles$median_quartile <- c(rep("median", 502), rep("1quartile", 502), rep("3quartile",502))

mean_or_median = mean
df_mfe_PSS <- data.frame(cbind(ntpos=c(rep(c(-125:125),4)), mfe=c(apply(mfe_PSS_up, 1, mean_or_median), apply(mfe_PSS_down, 1, mean_or_median), apply(mfe_PSS_up_control, 1, mean_or_median), apply(mfe_PSS_down_control,1, mean_or_median))))
df_mfe_PSS$updown <- rep(c(rep("up",251), rep("down", 251)),2)
df_mfe_PSS$control <- c(rep("data", 502), rep("control", 502))

Plot with shuffled data and non-shuffled data

p <- ggplot(data=df_mfe_PSS, aes(x=ntpos, y=mfe, color=updown)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(aes(linetype=control), size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name  ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)5ps", "rne(5p)>WT")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-10,-4.75)
p

ggsave("output/RNAfold/figures/deltaG_PSS_shuffled.pdf", plot=p, width=10, height=7, units="cm")

Plot with quartiles (1st, median, 3rd)

p <- ggplot(data=df_mfe_PSS_quartiles, aes(x=ntpos, y=mfe_distrib, color=updown, linetype=median_quartile)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + 
  scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name  ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)>5p", "rne(5p)>WT")) + 
  scale_linetype_manual(name="Quartiles",values=c(3,1,3), breaks=c("1quartile", "median", "3quartile"), labels=c("1st", "2nd", "3rd")) +
  xlab("Distance Relative to Cleavage Site (nt)") + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-14,-2)
p

ggsave("output/RNAfold/figures/deltaG_PSS_quartiles.pdf", plot=p, width=10, height=7, units="cm")
p <- ggplot(data=subset(df_mfe_PSS, df_mfe_PSS$control=="data"), aes(x=ntpos, y=mfe, color=updown)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name  ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)>5p", "rne(5p)>WT")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-10,-4.75)
p

ggsave("output/RNAfold/figures/deltaG_PSS.pdf", plot=p, width=10, height=7, units="cm")

run from command line (!takes a long time!): python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/peaks_WT_150.fasta RNAfold/mfe_files/mfe_peaks_WT_150_25nt.tsv 25 python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/peaks_TV_150.fasta RNAfold/mfe_files/mfe_peaks_TV_150_25nt.tsv 25

mfe_PSS_25nt_up <- read.delim("output/RNAfold/mfe_files/mfe_peaks_WT_150_25nt.tsv", header=TRUE, row.names=1)
mfe_PSS_25nt_down <- read.delim("output/RNAfold/mfe_files/mfe_peaks_TV_150_25nt.tsv", header=TRUE, row.names=1)
mfe_PSS_25nt_up_control <- read.delim("output/RNAfold/mfe_files/mfe_peaks_WT_150_25nt_control.tsv", header=TRUE, row.names=1)
mfe_PSS_25nt_down_control <- read.delim("output/RNAfold/mfe_files/mfe_peaks_TV_150_25nt_control.tsv", header=TRUE, row.names=1)

Processing site at position 150.5. When x-axis is labeled from -137.5:137.5, this corresponds to -0

df_mfe_PSS_25nt_quartiles <- data.frame(cbind(ntpos=c(rep(c(-137.5:137.5),2)), mfe_distrib=c(apply(mfe_PSS_25nt_up, 1, median), apply(mfe_PSS_25nt_down, 1, median),apply(mfe_PSS_25nt_up, 1, quantile,probs=0.25), apply(mfe_PSS_25nt_down, 1, quantile,probs=0.25),apply(mfe_PSS_25nt_up, 1,  quantile,probs=0.75), apply(mfe_PSS_25nt_down, 1,  quantile,probs=0.75))))
df_mfe_PSS_25nt_quartiles$updown <- rep(c(rep("up",276), rep("down", 276)),3)
df_mfe_PSS_25nt_quartiles$median_quartile <- c(rep("median", 552), rep("1quartile", 552), rep("3quartile",552))

mean_or_median = mean
df_mfe_PSS_25nt <- data.frame(cbind(ntpos=c(rep(c(-137.5:137.5),4)), mfe=c(apply(mfe_PSS_25nt_up, 1, mean_or_median), apply(mfe_PSS_25nt_down, 1, mean_or_median), apply(mfe_PSS_25nt_up_control, 1, mean_or_median), apply(mfe_PSS_25nt_down_control,1, mean_or_median))))
df_mfe_PSS_25nt$updown <- rep(c(rep("up",276), rep("down", 276)),2)
df_mfe_PSS_25nt$control <- c(rep("data", 552), rep("control", 552))
p <- ggplot(data=df_mfe_PSS_25nt, aes(x=ntpos, y=mfe, color=updown)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(aes(linetype=control), size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name  ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)", "rne(5p)")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) +ylim(-3.4,-1)
p

ggsave("output/RNAfold/figures/deltaG_PSS_25nt_shuffled.pdf", plot=p, width=10, height=7, units="cm")

p <- ggplot(data=df_mfe_PSS_25nt_quartiles, aes(x=ntpos, y=mfe_distrib, color=updown, linetype=median_quartile)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + 
  scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name  ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)", "rne(5p)")) + 
  scale_linetype_manual(name="Quartiles",values=c(3,1,3), breaks=c("1quartile", "median", "3quartile"), labels=c("1st", "2nd", "3rd")) +
  xlab("Distance Relative to Cleavage Site (nt)") + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-5.1,0.1)
p

ggsave("output/RNAfold/figures/deltaG_PSS_25nt_quartiles.pdf", plot=p, width=10, height=7, units="cm")

p <- ggplot(data=subset(df_mfe_PSS_25nt, df_mfe_PSS_25nt$control=="data"), aes(x=ntpos, y=mfe, color=updown)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name  ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)", "rne(5p)")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) +ylim(-3.4,-1)
p

ggsave("output/RNAfold/figures/deltaG_PSS_25nt.pdf", plot=p, width=10, height=7, units="cm")

4.7.2 Random Sites

# use same amount of random positions
shuffle_Ranges_perPlasmid <- function(Ranges_object, sequence_object, seed=NULL){
  set.seed(seed)

  # extract lengths of sequences
  lengths <- c()
  for(i in 1:length(sequence_object)){
    lengths[i] <- width(sequence_object[i])
  }
  names(lengths) <- names(sequence_object)

  # initialise new_ranges, end has to be set according to length of plasmids / chromosome
  new_ranges <- Ranges_object
  start(new_ranges) <- rep(1, length(new_ranges))
  
  # do actual shuffling  
  for(seq_i in 1:length(lengths)){
    seq <- names(lengths)[seq_i]
    indices <- which(seqnames(new_ranges)==seq)
    # set ends to max possible length to avoid width < 0
    end(new_ranges[indices]) <- rep(lengths[seq], length(new_ranges[indices]))
    
    # create random numbers
    start(new_ranges[indices]) <- round(runif(length(new_ranges[indices]), 1, lengths[seq]))
    end(new_ranges[indices]) <- start(new_ranges[indices])
  }
  
  return(new_ranges)
}

random_up_Ranges <- shuffle_Ranges_perPlasmid(PSS_Ranges_WT, syne_fasta, 42)
random_down_Ranges <- shuffle_Ranges_perPlasmid(PSS_Ranges_TV, syne_fasta, 42)

random_up_150 <- extract_RNA_sequences(random_up_Ranges, syne_fasta, 150)
random_down_150 <- extract_RNA_sequences(random_down_Ranges, syne_fasta, 150)

writeXStringSet(random_up_150, "output/RNAfold/random_up_150.fasta", append=FALSE,
                compress=FALSE, compression_level=NA, format="fasta")
writeXStringSet(random_down_150, "output/RNAfold/random_down_150.fasta", append=FALSE,
                compress=FALSE, compression_level=NA, format="fasta")

run from command line (!takes a long time!): python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/random_up_150.fasta RNAfold/mfe_files/mfe_random_up_150.tsv 50 python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/random_down_150.fasta RNAfold/mfe_files/mfe_random_down_150.tsv 50

mfe_random_up <- read.delim("output/RNAfold/mfe_files/mfe_random_up_150.tsv", header=TRUE, row.names=1)
mfe_random_up_control <- read.delim("output/RNAfold/mfe_files/mfe_random_up_150_control.tsv", header=TRUE, row.names=1)
mfe_random_down <- read.delim("output/RNAfold/mfe_files/mfe_random_down_150.tsv", header=TRUE, row.names=1)
mfe_random_down_control <- read.delim("output/RNAfold/mfe_files/mfe_random_down_150_control.tsv", header=TRUE, row.names=1)
df_mfe_random_quartiles <- data.frame(cbind(ntpos=c(rep(c(-125:125),2)), mfe_distrib=c(apply(mfe_random_up, 1, median), apply(mfe_random_down, 1, median),apply(mfe_random_up, 1, quantile,probs=0.25), apply(mfe_random_down, 1, quantile,probs=0.25),apply(mfe_random_up, 1,  quantile,probs=0.75), apply(mfe_random_down, 1,  quantile,probs=0.75))))
df_mfe_random_quartiles$updown <- rep(c(rep("up",251), rep("down", 251)),3)
df_mfe_random_quartiles$median_quartile <- c(rep("median", 502), rep("1quartile", 502), rep("3quartile",502))

mean_or_median = mean
df_mfe_random <- data.frame(cbind(ntpos=c(rep(c(-125:125),4)), mfe=c(apply(mfe_random_up, 1, mean_or_median), apply(mfe_random_down, 1, mean_or_median), apply(mfe_random_up_control, 1, mean_or_median), apply(mfe_random_down_control,1, mean_or_median))))
df_mfe_random$updown <- rep(c(rep("up",251), rep("down", 251)),2)
df_mfe_random$control <- c(rep("data", 502), rep("control", 502))

Plot with shuffled data and non-shuffled data

p <- ggplot(data=df_mfe_random, aes(x=ntpos, y=mfe, color=updown)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(aes(linetype=control), size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name  ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)>Ts", "rne(5p)>WT")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-10,-4.75)
p

ggsave("output/RNAfold/figures/deltaG_random_shuffled.pdf", plot=p, width=10, height=7, units="cm")

Plot with quartiles (1st, median, 3rd)

p <- ggplot(data=df_mfe_random_quartiles, aes(x=ntpos, y=mfe_distrib, color=updown, linetype=median_quartile)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + 
  scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name  ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)>Ts", "rne(5p)>WT")) + 
  scale_linetype_manual(name="Quartiles",values=c(3,1,3), breaks=c("1quartile", "median", "3quartile"), labels=c("1st", "2nd", "3rd")) +
  xlab("Distance Relative to Cleavage Site (nt)") + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-14,-2)
p

ggsave("output/RNAfold/figures/deltaG_random_quartiles.pdf", plot=p, width=10, height=7, units="cm")
p <- ggplot(data=subset(df_mfe_random, df_mfe_PSS$control=="data"), aes(x=ntpos, y=mfe, color=updown)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name  ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)>Ts", "rne(5p)>WT")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-10,-4.75)
p

ggsave("output/RNAfold/figures/deltaG_random.pdf", plot=p, width=10, height=7, units="cm")

run from command line (!takes a long time!): python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/random_up_150.fasta RNAfold/mfe_files/mfe_random_up_150_25nt.tsv 25 python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/random_down_150.fasta RNAfold/mfe_files/mfe_random_down_150_25nt.tsv 25

mfe_random_25nt_up <- read.delim("output/RNAfold/mfe_files/mfe_random_up_150_25nt.tsv", header=TRUE, row.names=1)
mfe_random_25nt_down <- read.delim("output/RNAfold/mfe_files/mfe_random_down_150_25nt.tsv", header=TRUE, row.names=1)
mfe_random_25nt_up_control <- read.delim("output/RNAfold/mfe_files/mfe_random_up_150_25nt_control.tsv", header=TRUE, row.names=1)
mfe_random_25nt_down_control <- read.delim("output/RNAfold/mfe_files/mfe_random_down_150_25nt_control.tsv", header=TRUE, row.names=1)

Processing site at position 150.5. When x-axis is labeled from -137.5:137.5, this corresponds to -0

df_mfe_random_25nt_quartiles <- data.frame(cbind(ntpos=c(rep(c(-137.5:137.5),2)), mfe_distrib=c(apply(mfe_random_25nt_up, 1, median), apply(mfe_random_25nt_down, 1, median),apply(mfe_random_25nt_up, 1, quantile,probs=0.25), apply(mfe_random_25nt_down, 1, quantile,probs=0.25),apply(mfe_random_25nt_up, 1,  quantile,probs=0.75), apply(mfe_random_25nt_down, 1,  quantile,probs=0.75))))
df_mfe_random_25nt_quartiles$updown <- rep(c(rep("up",276), rep("down", 276)),3)
df_mfe_random_25nt_quartiles$median_quartile <- c(rep("median", 552), rep("1quartile", 552), rep("3quartile",552))

mean_or_median = mean
df_mfe_random_25nt <- data.frame(cbind(ntpos=c(rep(c(-137.5:137.5),4)), mfe=c(apply(mfe_random_25nt_up, 1, mean_or_median), apply(mfe_random_25nt_down, 1, mean_or_median), apply(mfe_random_25nt_up_control, 1, mean_or_median), apply(mfe_random_25nt_down_control,1, mean_or_median))))
df_mfe_random_25nt$updown <- rep(c(rep("up",276), rep("down", 276)),2)
df_mfe_random_25nt$control <- c(rep("data", 552), rep("control", 552))
p <- ggplot(data=df_mfe_random_25nt, aes(x=ntpos, y=mfe, color=updown)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(aes(linetype=control), size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name  ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)>Ts", "rne(5p)>WT")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) +ylim(-3.4,-1)
p

ggsave("output/RNAfold/figures/deltaG_random_25nt_shuffled.pdf", plot=p, width=10, height=7, units="cm")

p <- ggplot(data=df_mfe_random_25nt_quartiles, aes(x=ntpos, y=mfe_distrib, color=updown, linetype=median_quartile)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + 
  scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name  ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)>Ts", "rne(5p)>WT")) + 
  scale_linetype_manual(name="Quartiles",values=c(3,1,3), breaks=c("1quartile", "median", "3quartile"), labels=c("1st", "2nd", "3rd")) +
  xlab("Distance Relative to Cleavage Site (nt)") + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-5.1,0.1)
p

ggsave("output/RNAfold/figures/deltaG_random_25nt_quartiles.pdf", plot=p, width=10, height=7, units="cm")

p <- ggplot(data=subset(df_mfe_random_25nt, df_mfe_random_25nt$control=="data"), aes(x=ntpos, y=mfe, color=updown)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name  ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)>Ts", "rne(5p)>WT")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) +ylim(-3.4,-1)
p

ggsave("output/RNAfold/figures/deltaG_random_25nt.pdf", plot=p, width=10, height=7, units="cm")

4.8 For AU content, ACGU content

Check for nucleotide content around processing sites, compare between different sets of processing sites.

length_up_down <- 50
peaks_all <- extract_RNA_sequences(PSS_Ranges, syne_fasta, length_up_down)
peaks_up <- extract_RNA_sequences(PSS_Ranges_WT, syne_fasta, length_up_down)
peaks_down <- extract_RNA_sequences(PSS_Ranges_TV, syne_fasta, length_up_down)

cons_mat <- consensusMatrix(peaks_up)
perc_AU_up <- (cons_mat[1,]+cons_mat[4,])/apply(cons_mat[1:4,], 2,sum)
df_perc_ACGU_up <- data.frame(cons_mat[1:4,])/sum(cons_mat[,1])*100
df_perc_ACGU_up <- data.frame(cbind(ntpos=c(-length_up_down:-1,1:length_up_down), t(df_perc_ACGU_up)))
df_perc_AU_up <- df_perc_ACGU_up[,c(1,2,5)]
df_perc_CG_up <- df_perc_ACGU_up[,c(1,3,4)]
df_perc_ACGU_up <- pivot_longer(df_perc_ACGU_up, 2:5)
df_perc_AU_up <- pivot_longer(df_perc_AU_up, 2:3)
df_perc_CG_up <- pivot_longer(df_perc_CG_up, 2:3)

cons_mat <- consensusMatrix(peaks_down)
perc_AU_down <- (cons_mat[1,]+cons_mat[4,])/apply(cons_mat[1:4,], 2,sum)
df_perc_ACGU_down <- data.frame(cons_mat[1:4,])/sum(cons_mat[,1])*100
df_perc_ACGU_down <- data.frame(cbind(ntpos=c(-length_up_down:-1,1:length_up_down), t(df_perc_ACGU_down)))
df_perc_AU_down <- df_perc_ACGU_down[,c(1,2,5)]
df_perc_CG_down <- df_perc_ACGU_down[,c(1,3,4)]
df_perc_ACGU_down <- pivot_longer(df_perc_ACGU_down, 2:5)
df_perc_AU_down <- pivot_longer(df_perc_AU_down, 2:3)
df_perc_CG_down <- pivot_longer(df_perc_CG_down, 2:3)

df_percAU <- data.frame(cbind(ntpos=rep(c(-length_up_down:-1,1:length_up_down),2),percAU=c(perc_AU_up, perc_AU_down)*100))
df_percAU$updown <- c(rep("up",2*length_up_down), rep("down", 2*length_up_down))

general_AU <- (sum(oligonucleotideFrequency(syne_fasta, width=1)[,c(1,4)])/sum(oligonucleotideFrequency(syne_fasta, width=1)))*100

ACGU_colors <- palette_OkabeIto[c(3,1,5,6)]
names(ACGU_colors) <- c("U", "G", "C", "A")

Compare AU content between peak positions which are either up- or downregulated

p <- ggplot(data=df_percAU, aes(x=ntpos, y=percAU, color=updown)) + geom_hline(aes(yintercept=general_AU), color="darkgray", linetype=2) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=.5) + theme_light() + 
  scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name  ="Peaks",
                           breaks=c("up", "down"),
                           labels=c("rne(WT)>rne(5p)", "rne(5p)>rne(WT)")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab("Percentage AU (%)")
  
p <- p + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5))

ggsave("output/nt_content_plots/AUcontent.pdf", plot=p, width=10, height=7, units="cm")

Code for disinguishing nts:

p <- ggplot(data=df_perc_ACGU_up, aes(x=ntpos, y=value, color=name)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + scale_color_manual(name="nt", values=ACGU_colors)+ xlab("Distance Relative to Cleavage Site (nt)") + ylab("Percentage ACGU (%)") + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) +ylim(5,72)+xlim(-25,+25)
p
## Warning: Removed 200 row(s) containing missing values (geom_path).

ggsave("output/nt_content_plots/ACGU_up_content.pdf", plot=p, width=10, height=7, units="cm")
## Warning: Removed 200 row(s) containing missing values (geom_path).
p <- ggplot(data=df_perc_ACGU_down, aes(x=ntpos, y=value, color=name)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=.5) + theme_light() + scale_color_manual(name="nt", values=ACGU_colors)+ xlab("Distance Relative to Cleavage Site (nt)") + ylab("Percentage ACGU (%)") + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5))+ylim(5,72)+xlim(-25,+25)
p
## Warning: Removed 200 row(s) containing missing values (geom_path).

ggsave("output/nt_content_plots/ACGU_down_content.pdf", plot=p, width=10, height=7, units="cm")
## Warning: Removed 200 row(s) containing missing values (geom_path).

Session info

## R version 4.0.4 (2021-02-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux 11 (bullseye)
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/atlas/libblas.so.3.10.3
## LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3
## 
## locale:
##  [1] LC_CTYPE=de_DE.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=de_DE.UTF-8        LC_COLLATE=de_DE.UTF-8    
##  [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=de_DE.UTF-8   
##  [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] clusterProfiler_3.18.1      colorblindr_0.1.0          
##  [3] colorspace_2.0-0            viridis_0.5.1              
##  [5] viridisLite_0.3.0           rtracklayer_1.50.0         
##  [7] Biostrings_2.58.0           XVector_0.30.0             
##  [9] vsn_3.58.0                  RColorBrewer_1.1-2         
## [11] pheatmap_1.0.12             ggpubr_0.4.0               
## [13] ggrepel_0.9.1               edgeR_3.32.1               
## [15] limma_3.46.0                DESeq2_1.30.1              
## [17] SummarizedExperiment_1.20.0 Biobase_2.50.0             
## [19] MatrixGenerics_1.2.1        matrixStats_0.58.0         
## [21] GenomicRanges_1.42.0        GenomeInfoDb_1.26.2        
## [23] IRanges_2.24.1              S4Vectors_0.28.1           
## [25] BiocGenerics_0.36.0         forcats_0.5.1              
## [27] stringr_1.4.0               dplyr_1.0.5                
## [29] purrr_0.3.4                 readr_1.4.0                
## [31] tidyr_1.1.3                 tibble_3.1.0               
## [33] ggplot2_3.3.3               tidyverse_1.3.0            
## [35] knitr_1.31                 
## 
## loaded via a namespace (and not attached):
##   [1] shadowtext_0.0.7         readxl_1.3.1             backports_1.2.1         
##   [4] fastmatch_1.1-0          igraph_1.2.6             plyr_1.8.6              
##   [7] splines_4.0.4            BiocParallel_1.24.1      digest_0.6.27           
##  [10] htmltools_0.5.1.1        GOSemSim_2.16.1          GO.db_3.12.1            
##  [13] fansi_0.4.2              magrittr_2.0.1           memoise_2.0.0           
##  [16] openxlsx_4.2.3           graphlayouts_0.7.1       annotate_1.68.0         
##  [19] modelr_0.1.8             enrichplot_1.10.2        blob_1.2.1              
##  [22] rvest_1.0.0              haven_2.3.1              xfun_0.22               
##  [25] crayon_1.4.1             RCurl_1.98-1.2           jsonlite_1.7.2          
##  [28] scatterpie_0.1.5         genefilter_1.72.1        survival_3.2-7          
##  [31] glue_1.4.2               polyclip_1.10-0          gtable_0.3.0            
##  [34] zlibbioc_1.36.0          DelayedArray_0.16.2      car_3.0-10              
##  [37] abind_1.4-5              scales_1.1.1             DOSE_3.16.0             
##  [40] DBI_1.1.1                rstatix_0.7.0            Rcpp_1.0.6              
##  [43] xtable_1.8-4             foreign_0.8-81           bit_4.0.4               
##  [46] preprocessCore_1.52.1    httr_1.4.2               fgsea_1.16.0            
##  [49] ellipsis_0.3.1           farver_2.1.0             pkgconfig_2.0.3         
##  [52] XML_3.99-0.5             sass_0.3.1               dbplyr_2.1.0            
##  [55] locfit_1.5-9.4           utf8_1.1.4               labeling_0.4.2          
##  [58] tidyselect_1.1.0         rlang_0.4.10             reshape2_1.4.4          
##  [61] AnnotationDbi_1.52.0     munsell_0.5.0            cellranger_1.1.0        
##  [64] tools_4.0.4              cachem_1.0.4             downloader_0.4          
##  [67] cli_2.3.1                generics_0.1.0           RSQLite_2.2.3           
##  [70] broom_0.7.5              evaluate_0.14            fastmap_1.1.0           
##  [73] yaml_2.2.1               bit64_4.0.5              fs_1.5.0                
##  [76] tidygraph_1.2.0          zip_2.1.1                ggraph_2.0.5            
##  [79] DO.db_2.9                xml2_1.3.2               compiler_4.0.4          
##  [82] rstudioapi_0.13          curl_4.3                 affyio_1.60.0           
##  [85] ggsignif_0.6.1           reprex_1.0.0             tweenr_1.0.1            
##  [88] geneplotter_1.68.0       bslib_0.2.4              stringi_1.5.3           
##  [91] highr_0.8                lattice_0.20-41          Matrix_1.3-2            
##  [94] vctrs_0.3.6              pillar_1.5.1             lifecycle_1.0.0         
##  [97] BiocManager_1.30.10      jquerylib_0.1.3          cowplot_1.1.1           
## [100] data.table_1.14.0        bitops_1.0-6             qvalue_2.22.0           
## [103] R6_2.5.0                 affy_1.68.0              gridExtra_2.3           
## [106] rio_0.5.26               MASS_7.3-53.1            assertthat_0.2.1        
## [109] withr_2.4.1              GenomicAlignments_1.26.0 Rsamtools_2.6.0         
## [112] GenomeInfoDbData_1.2.4   hms_1.0.0                grid_4.0.4              
## [115] rvcheck_0.1.8            rmarkdown_2.7            carData_3.0-4           
## [118] ggforce_0.3.3            lubridate_1.7.10

References

Innocenti, Nicolas, Monica Golumbeanu, Aymeric Fouquier d’Hérouël, Caroline Lacoux, Rémy A. Bonnin, Sean P. Kennedy, Françoise Wessner, et al. 2015. “Whole-Genome Mapping of 5’ Rna Ends in Bacteria by Tagged Sequencing: A Comprehensive View in Enterococcus Faecalis.” RNA 21 (5): 1018–30. https://doi.org/10.1261/rna.048470.114.

Kopf, M., S. Klähn, I. Scholz, J. K. F. Matthiessen, W. R. Hess, and B. Voss. 2014. “Comparative Analysis of the Primary Transcriptome of Synechocystis Sp. PCC 6803.” DNA Research 21 (5): 527–39. https://doi.org/10.1093/dnares/dsu018.

Sakurai, Isamu, Damir Stazic, Marion Eisenhut, Eerika Vuorio, Claudia Steglich, Wolfgang R. Hess, and Eva-Mari Aro. 2012. “Positive Regulation of psbA Gene Expression by Cis-Encoded Antisense RNAs in Synechocystis Sp. PCC 6803.” Plant Physiology 160 (2): 1000–1010. https://doi.org/10.1104/pp.112.202127.